List of usage examples for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4
public static final String unescapeHtml4(final String input)
Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.
From source file:io.github.karols.hocr4j.dom.HocrTag.java
/** * Creates a new tag with the given elements * and with name and attributes based on the given contents of the opening tag. * @param openingTagString contents of the opening tag * @param contents elements in the body of the tag */// www.j a v a 2s. c o m public HocrTag(String openingTagString, List<HocrElement> contents) { final String x = openingTagString; int i = 1; while (x.charAt(i) == ' ') i++; int nameStart = i; while (x.charAt(i) != ' ' && x.charAt(i) != '>' && x.charAt(i) != '/') { i++; } name = x.substring(nameStart, i).toLowerCase(Locale.US); while (x.charAt(i) == ' ') i++; HashMap<String, String> attributes = new HashMap<String, String>(); while (x.charAt(i) != '/' && x.charAt(i) != '>') { int attrNameStart = i; ing_bad: while (true) { switch (x.charAt(i)) { case '=': case '/': case ' ': case '>': break ing_bad; default: i++; } } String attrName = x.substring(attrNameStart, i); while (x.charAt(i) == ' ') i++; String attrValue = attrName; if (x.charAt(i) == '=') { i++; while (x.charAt(i) == ' ') i++; int attrValueStart = i; switch (x.charAt(i)) { case '\'': attrValueStart++; i++; while (x.charAt(i) != '\'') i++; attrValue = x.substring(attrValueStart, i); i++; break; case '\"': attrValueStart++; i++; while (x.charAt(i) != '\"') i++; attrValue = x.substring(attrValueStart, i); i++; break; default: while (x.charAt(i) != ' ' && x.charAt(i) != '/' && x.charAt(i) != '>') i++; attrValue = x.substring(attrValueStart, i); break; } } while (x.charAt(i) == ' ') i++; attributes.put(StringEscapeUtils.unescapeHtml4(attrName), StringEscapeUtils.unescapeHtml4(attrValue)); } this.id = attributes.get("id"); this.clazz = attributes.get("class"); this.title = attributes.get("title"); this.attributes = ImmutableMap.copyOf(attributes); this.elements = ImmutableList.copyOf(contents); }
From source file:com.silverpeas.util.EncodeHelper.java
/**
 * Converts a text whose characters are encoded as HTML entities into the
 * corresponding plain Java string. The received text is traced before decoding.
 *
 * @param text a text containing HTML-encoded characters; callers are expected to
 *        pass a non-null value
 * @return the text with its HTML entity escapes replaced by the actual characters
 */
public static String transformHtmlCode(String text) {
    final String traceMessage = " text recu " + text;
    SilverTrace.info("util", "Encode.transformHtmlCode()", "root.MSG_GEN_PARAM_VALUE", traceMessage);
    final String decoded = StringEscapeUtils.unescapeHtml4(text);
    return decoded;
}
From source file:mobisocial.musubi.util.OGUtil.java
public static OGData getOrGuess(String url) { DefaultHttpClient hc = new DefaultHttpClient(); HttpResponse res;//from w w w .j av a2 s .com try { HttpGet hg = new HttpGet(url); res = hc.execute(hg); } catch (Exception e) { Log.e(TAG, "unable to fetch page to get og tags", e); return null; } String location = url; //TODO: if some kind of redirect magic happened, then //make the location match that OGData og = new OGData(); HttpEntity he = res.getEntity(); Header content_type = he.getContentType(); //TODO: check the content directly if they forget the type header if (content_type == null || content_type.getValue() == null) { Log.e(TAG, "page missing content type ..abandoning: " + url); return null; } og.mMimeType = content_type.getValue(); //just make a thumbnail if the shared item is an image if (og.mMimeType.startsWith("image/")) { Bitmap b; try { b = BitmapFactory.decodeStream(he.getContent()); } catch (Exception e) { return null; } //TODO: scaling int w = b.getWidth(); int h = b.getHeight(); if (w > h) { h = h * 200 / w; w = 200; } else { w = w * 200 / h; h = 200; } Bitmap b2 = Bitmap.createScaledBitmap(b, w, h, true); b.recycle(); b = b2; ByteArrayOutputStream baos = new ByteArrayOutputStream(); b.compress(CompressFormat.PNG, 100, baos); og.mImage = baos.toByteArray(); b.recycle(); return og; } //if its not html, we can't extract more details, the caller //should rely on what they already know. 
if (!og.mMimeType.startsWith("text/html") && !og.mMimeType.startsWith("application/xhtml")) { Log.e(TAG, "shared content is not a known type for meta data processing " + og.mMimeType); return og; } String html; try { html = IOUtils.toString(he.getContent()); } catch (Exception e) { Log.e(TAG, "failed to read html content", e); return og; } Matcher m = sTitleRegex.matcher(html); if (m.find()) { og.mTitle = StringEscapeUtils.unescapeHtml4(m.group(1)); } m = sMetaRegex.matcher(html); int offset = 0; String raw_description = null; while (m.find(offset)) { try { String meta_tag = m.group(); Matcher mp = sPropertyOfMeta.matcher(meta_tag); if (!mp.find()) continue; String type = mp.group(1); type = type.substring(1, type.length() - 1); Matcher md = sContentOfMeta.matcher(meta_tag); if (!md.find()) continue; String data = md.group(1); //remove quotes data = data.substring(1, data.length() - 1); data = StringEscapeUtils.unescapeHtml4(data); if (type.equalsIgnoreCase("og:title")) { og.mTitle = data; } else if (type.equalsIgnoreCase("og:image")) { HttpResponse resi; try { HttpGet hgi = new HttpGet(data); resi = hc.execute(hgi); } catch (Exception e) { Log.e(TAG, "unable to fetch og image url", e); continue; } HttpEntity hei = resi.getEntity(); if (!hei.getContentType().getValue().startsWith("image/")) { Log.e(TAG, "image og tag points to non image data" + hei.getContentType().getValue()); } try { Bitmap b; try { b = BitmapFactory.decodeStream(hei.getContent()); } catch (Exception e) { return null; } //TODO: scaling int w = b.getWidth(); int h = b.getHeight(); if (w > h) { h = h * Math.min(200, w) / w; w = Math.min(200, w); } else { w = w * Math.min(200, h) / h; h = Math.min(200, h); } Bitmap b2 = Bitmap.createScaledBitmap(b, w, h, true); b.recycle(); b = b2; ByteArrayOutputStream baos = new ByteArrayOutputStream(); b.compress(CompressFormat.PNG, 100, baos); b.recycle(); og.mImage = baos.toByteArray(); } catch (Exception e) { Log.e(TAG, "failed to fetch image for og", e); 
continue; } } else if (type.equalsIgnoreCase("description")) { raw_description = data; } else if (type.equalsIgnoreCase("og:description")) { og.mDescription = data; } else if (type.equalsIgnoreCase("og:url")) { og.mUrl = data; } } finally { offset = m.end(); } } HashSet<String> already_fetched = new HashSet<String>(); if (og.mImage == null) { int max_area = 0; m = sImageRegex.matcher(html); int img_offset = 0; while (m.find(img_offset)) { try { String img_tag = m.group(); Matcher ms = sSrcOfImage.matcher(img_tag); if (!ms.find()) continue; String img_src = ms.group(1); img_src = img_src.substring(1, img_src.length() - 1); img_src = StringEscapeUtils.unescapeHtml4(img_src); //don't fetch an image twice (like little 1x1 images) if (already_fetched.contains(img_src)) continue; already_fetched.add(img_src); HttpResponse resi; try { HttpGet hgi = new HttpGet(new URL(new URL(location), img_src).toString()); resi = hc.execute(hgi); } catch (Exception e) { Log.e(TAG, "unable to fetch image url for biggest image search" + img_src, e); continue; } HttpEntity hei = resi.getEntity(); if (hei == null) { Log.w(TAG, "image missing en ..trying entity response: " + url); continue; } Header content_type_image = hei.getContentType(); if (content_type_image == null || content_type_image.getValue() == null) { Log.w(TAG, "image missing content type ..trying anyway: " + url); } if (!content_type_image.getValue().startsWith("image/")) { Log.w(TAG, "image tag points to non image data " + hei.getContentType().getValue() + " " + img_src); } try { Bitmap b; try { b = BitmapFactory.decodeStream(hei.getContent()); } catch (Exception e) { return null; } //TODO: scaling int w = b.getWidth(); int h = b.getHeight(); if (w * h <= max_area) { continue; } if (w < 32 || h < 32) { //skip dinky crap continue; } if (w > h) { h = h * Math.min(200, w) / w; w = Math.min(200, w); } else { w = w * Math.min(200, h) / h; h = Math.min(200, h); } Bitmap b2 = Bitmap.createScaledBitmap(b, w, h, true); b.recycle(); b 
= b2; ByteArrayOutputStream baos = new ByteArrayOutputStream(); b.compress(CompressFormat.PNG, 100, baos); og.mImage = baos.toByteArray(); b.recycle(); max_area = w * h; } catch (Exception e) { Log.e(TAG, "failed to fetch image for og", e); continue; } } finally { img_offset = m.end(); } } } if (og.mDescription == null) og.mDescription = raw_description; return og; }
From source file:com.wiseowl.WiseOwl.wikiClean.WikiClean.java
public String clean(String content) { //String content = getWikiMarkup(page); content = removeRefs(content);/*from w w w . j a v a 2 s .c o m*/ content = removeInterWikiLinks(content); content = removeParentheticals(content); content = fixUnitConversion(content); content = ImageCaptionsRemover.remove(content); content = DoubleBracesRemover.remove(content); content = removeHtmlComments(content); content = removeEmphasis(content); content = removeHeadings(content); content = removeCategoryLinks(content); content = removeLinks(content); content = removeMath(content); content = removeGallery(content); content = removeNoToc(content); content = removeIndentation(content); content = TableRemover.remove(content); // For some reason, some HTML entities are doubly encoded. content = StringEscapeUtils.unescapeHtml4(StringEscapeUtils.unescapeHtml4(content)); content = removeHtmlTags(content); // Finally, fold multiple newlines. content = compressMultipleNewlines(content); return content.trim().replaceAll("\\n+", " "); }
From source file:com.datumbox.framework.core.utilities.text.parsers.HTMLParser.java
/**
 * Removes all tags from the given HTML, decodes its HTML entities and then
 * removes extra spaces from the result.
 *
 * @param html the HTML text to process
 * @return the text after tag removal, entity decoding and space clean-up
 */
private static String clear(String html) {
    final String withoutTags = unsafeRemoveAllTags(html);
    final String unescaped = StringEscapeUtils.unescapeHtml4(withoutTags);
    return StringCleaner.removeExtraSpaces(unescaped);
}
From source file:com.nttec.everychan.chans.krautchan.KrautBoardsListReader.java
private void handleFilter(int filter) throws IOException { switch (filter) { case FILTER_CATEGORY: skipUntilSequence(SPAN_CLOSE);// w ww . ja v a 2 s. c o m String cat = readUntilSequence(H2_CLOSE); if (!cat.contains("<span")) currentCategory = StringEscapeUtils.unescapeHtml4(cat); break; case FILTER_BOARD: skipUntilSequence(CLOSE); String board = RegexUtils.removeHtmlTags(readUntilSequence(LI_CLOSE)).trim(); Matcher boardMatcher = BOARD_PATTERN.matcher(board); if (boardMatcher.matches()) { SimpleBoardModel model = new SimpleBoardModel(); model.chan = KrautModule.CHAN_NAME; model.boardName = boardMatcher.group(1); model.boardDescription = boardMatcher.group(2); model.boardCategory = currentCategory; model.nsfw = SFW_BOARDS.indexOf(model.boardName) == -1; boards.add(model); } } }
From source file:com.ryan.ryanreader.reddit.prepared.RedditPreparedComment.java
public RedditPreparedComment(final Context context, final RedditComment comment, final RedditPreparedComment parentComment, final long timestamp, final boolean needsUpdating, final RedditPreparedPost parentPost, final RedditAccount user, final EnumSet<PrefsUtility.AppearanceCommentHeaderItems> headerItems) { this.src = comment; this.parentPost = parentPost; this.headerItems = headerItems; // TODO custom time // TODO don't fetch these every time final TypedArray appearance = context .obtainStyledAttributes(new int[] { R.attr.rrCommentHeaderBoldCol, R.attr.rrCommentHeaderAuthorCol, R.attr.rrPostSubtitleUpvoteCol, R.attr.rrPostSubtitleDownvoteCol }); rrCommentHeaderBoldCol = appearance.getColor(0, 255); rrCommentHeaderAuthorCol = appearance.getColor(1, 255); rrPostSubtitleUpvoteCol = appearance.getColor(2, 255); rrPostSubtitleDownvoteCol = appearance.getColor(3, 255); body = RedditCommentTextParser.parse(StringEscapeUtils.unescapeHtml4(comment.body)); if (comment.author_flair_text != null) { flair = StringEscapeUtils.unescapeHtml4(comment.author_flair_text); } else {/*from ww w.ja v a 2 s . co m*/ flair = null; } if (parentComment == null) { indentation = 0; } else { indentation = parentComment.indentation + 1; parentComment.addChild(this); } idAlone = comment.id; idAndType = comment.name; if (comment.likes == null) { voteDirection = 0; } else { voteDirection = Boolean.TRUE.equals(comment.likes) ? 1 : -1; } lastChange = timestamp; if (src.likes != null) { RedditChangeDataManager.getInstance(context).update(src.link_id, user, this, true); } else if (needsUpdating) { RedditChangeDataManager.getInstance(context).update(src.link_id, user, this, false); } rebuildHeader(context); }
From source file:com.aistor.modules.cms.service.ArticleService.java
@Transactional(readOnly = false) public void save(Article article) { if (article.getArticleData().getContent() != null) { article.getArticleData()/* w w w . ja v a 2s . c om*/ .setContent(StringEscapeUtils.unescapeHtml4(article.getArticleData().getContent())); } // ???? if (!SecurityUtils.getSubject().isPermitted("cms:article:audit")) { article.setStatus(Article.STATUS_AUDIT); } if (article.getId() == null) { article.setUser(UserUtils.getUser()); } article.setUpdateDate(new Date()); articleDao.clear(); articleDao.save(article); }
From source file:com.thinkgem.jeesite.modules.issue.service.IssueService.java
/**
 * Saves the given issue, HTML-unescaping its content first when content is present.
 *
 * @param issue the issue to save
 */
@Transactional(readOnly = false)
public void save(IssueInfo issue) {
    final String rawContent = issue.getContent();
    if (rawContent != null) {
        final String decoded = StringEscapeUtils.unescapeHtml4(rawContent);
        issue.setContent(decoded);
    }
    issueDao.save(issue);
}
From source file:com.microsoft.windowsazure.services.table.client.AtomPubParser.java
/**
 * Reserved for internal use. Parses the operation response as an entity. Parses the result returned in the
 * specified stream in AtomPub format into a {@link TableResult} containing an entity of the specified class type
 * projected using the specified resolver.
 * <p>
 * The reader must be positioned on the start of the entry element when this method is called; on a normal
 * return that reads the properties, it is positioned on the matching end of the entry element.
 *
 * @param xmlr
 *            An <code>XMLStreamReader</code> on the input stream.
 * @param clazzType
 *            The class type <code>T</code> implementing {@link TableEntity} for the entity returned. Set to
 *            <code>null</code> to ignore the returned entity and copy only response properties into the
 *            {@link TableResult} object.
 * @param resolver
 *            An {@link EntityResolver} instance to project the entity into an instance of type <code>R</code>. Set
 *            to <code>null</code> to return the entity as an instance of the class type <code>T</code>.
 * @param opContext
 *            An {@link OperationContext} object used to track the execution of the operation.
 * @return
 *         A {@link TableResult} containing the parsed entity result of the operation.
 *
 * @throws XMLStreamException
 *             if an error occurs while accessing the stream.
 * @throws ParseException
 *             if an error occurs while parsing the stream.
 * @throws InstantiationException
 *             if an error occurs while constructing the result.
 * @throws IllegalAccessException
 *             if an error occurs in reflection while parsing the result.
 * @throws StorageException
 *             if a storage service error occurs.
 */
protected static <T extends TableEntity, R> TableResult parseEntity(final XMLStreamReader xmlr,
        final Class<T> clazzType, final EntityResolver<R> resolver, final OperationContext opContext)
        throws XMLStreamException, ParseException, InstantiationException, IllegalAccessException,
        StorageException {
    int eventType = xmlr.getEventType();
    final TableResult res = new TableResult();

    // The reader has to be on the opening entry element; the etag is an attribute of that element.
    xmlr.require(XMLStreamConstants.START_ELEMENT, null, ODataConstants.ENTRY);

    res.setEtag(StringEscapeUtils.unescapeHtml4(
            xmlr.getAttributeValue(ODataConstants.DATA_SERVICES_METADATA_NS, ODataConstants.ETAG)));

    // Walk the entry until the properties element has been consumed.
    while (xmlr.hasNext()) {
        eventType = xmlr.next();

        if (eventType == XMLStreamConstants.CHARACTERS) {
            // Character data between elements is read and discarded.
            xmlr.getText();
            continue;
        }

        final String name = xmlr.getName().toString();

        if (eventType == XMLStreamConstants.START_ELEMENT) {
            if (name.equals(ODataConstants.BRACKETED_ATOM_NS + ODataConstants.ID)) {
                res.setId(Utility.readElementFromXMLReader(xmlr, ODataConstants.ID));
            }
            else if (name
                    .equals(ODataConstants.BRACKETED_DATA_SERVICES_METADATA_NS + ODataConstants.PROPERTIES)) {
                // Do read properties
                if (resolver == null && clazzType == null) {
                    // Nobody wants the entity: return what has been collected so far,
                    // leaving the rest of the entry unread.
                    return res;
                }
                else {
                    res.setProperties(readProperties(xmlr, opContext));
                    break;
                }
            }
        }
    }

    // Move to end Content, skipping at most one character-data event on the way.
    eventType = xmlr.next();
    if (eventType == XMLStreamConstants.CHARACTERS) {
        eventType = xmlr.next();
    }
    xmlr.require(XMLStreamConstants.END_ELEMENT, null, ODataConstants.CONTENT);

    // Move to the end of the entry in the same way.
    eventType = xmlr.next();
    if (eventType == XMLStreamConstants.CHARACTERS) {
        eventType = xmlr.next();
    }
    xmlr.require(XMLStreamConstants.END_ELEMENT, null, ODataConstants.ENTRY);

    String rowKey = null;
    String partitionKey = null;
    Date timestamp = null;

    // Remove core properties from map and set individually
    EntityProperty tempProp = res.getProperties().get(TableConstants.PARTITION_KEY);
    if (tempProp != null) {
        res.getProperties().remove(TableConstants.PARTITION_KEY);
        partitionKey = tempProp.getValueAsString();
    }

    tempProp = res.getProperties().get(TableConstants.ROW_KEY);
    if (tempProp != null) {
        res.getProperties().remove(TableConstants.ROW_KEY);
        rowKey = tempProp.getValueAsString();
    }

    tempProp = res.getProperties().get(TableConstants.TIMESTAMP);
    if (tempProp != null) {
        res.getProperties().remove(TableConstants.TIMESTAMP);
        timestamp = tempProp.getValueAsDate();
    }

    if (resolver != null) {
        // Call resolver; it takes precedence over clazzType when both are supplied.
        res.setResult(resolver.resolve(partitionKey, rowKey, timestamp, res.getProperties(), res.getEtag()));
    }
    else if (clazzType != null) {
        // Generate new entity and return
        final T entity = clazzType.newInstance();
        entity.setEtag(res.getEtag());

        entity.setPartitionKey(partitionKey);
        entity.setRowKey(rowKey);
        entity.setTimestamp(timestamp);

        entity.readEntity(res.getProperties(), opContext);

        res.setResult(entity);
    }

    return res;
}