Example usage for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

List of usage examples for org.apache.commons.lang3 StringEscapeUtils unescapeHtml4

Introduction

On this page you can find usage examples for the unescapeHtml4 method of org.apache.commons.lang3.StringEscapeUtils.

Prototype

public static final String unescapeHtml4(final String input) 

Source Link

Document

Unescapes a string containing entity escapes to a string containing the actual Unicode characters corresponding to the escapes.

Usage

From source file:io.github.karols.hocr4j.dom.HocrTag.java

/**
 * Builds a tag from the raw text of its opening tag and the elements it encloses.
 * The lower-cased tag name and the attribute map are parsed out of the opening tag text;
 * the {@code id}, {@code class} and {@code title} attributes are also copied into
 * their own fields.
 *
 * @param openingTagString contents of the opening tag
 * @param contents elements in the body of the tag
 */
public HocrTag(String openingTagString, List<HocrElement> contents) {
    final String tag = openingTagString;

    // Scanning starts at index 1; index 0 (presumably the '<') is never inspected.
    int pos = 1;
    while (tag.charAt(pos) == ' ') {
        pos++;
    }

    // Tag name: everything up to the next space, '>' or '/'.
    final int nameBegin = pos;
    while (" >/".indexOf(tag.charAt(pos)) < 0) {
        pos++;
    }
    this.name = tag.substring(nameBegin, pos).toLowerCase(Locale.US);
    while (tag.charAt(pos) == ' ') {
        pos++;
    }

    final HashMap<String, String> parsed = new HashMap<String, String>();
    // One attribute per iteration, until the tag is terminated by '/' or '>'.
    while ("/>".indexOf(tag.charAt(pos)) < 0) {
        // Attribute name: runs until '=', '/', space or '>'.
        final int keyBegin = pos;
        while ("=/ >".indexOf(tag.charAt(pos)) < 0) {
            pos++;
        }
        final String key = tag.substring(keyBegin, pos);
        while (tag.charAt(pos) == ' ') {
            pos++;
        }

        // An attribute without "=value" uses its own name as its value.
        String value = key;
        if (tag.charAt(pos) == '=') {
            pos++;
            while (tag.charAt(pos) == ' ') {
                pos++;
            }
            final char first = tag.charAt(pos);
            if (first == '\'' || first == '\"') {
                // Quoted value: take everything up to the next quote of the same kind.
                pos++;
                final int valueBegin = pos;
                while (tag.charAt(pos) != first) {
                    pos++;
                }
                value = tag.substring(valueBegin, pos);
                // Step over the closing quote.
                pos++;
            } else {
                // Bare value: ends at a space, '/' or '>'.
                final int valueBegin = pos;
                while (" />".indexOf(tag.charAt(pos)) < 0) {
                    pos++;
                }
                value = tag.substring(valueBegin, pos);
            }
        }
        while (tag.charAt(pos) == ' ') {
            pos++;
        }
        parsed.put(StringEscapeUtils.unescapeHtml4(key), StringEscapeUtils.unescapeHtml4(value));
    }

    this.id = parsed.get("id");
    this.clazz = parsed.get("class");
    this.title = parsed.get("title");
    this.attributes = ImmutableMap.copyOf(parsed);
    this.elements = ImmutableList.copyOf(contents);
}

From source file:com.silverpeas.util.EncodeHelper.java

/**
   * Converts a text whose characters are specifically encoded for HTML into the equivalent
   * plain text, by decoding its HTML entities. The incoming text is traced before decoding.
   *
   * @param text (String) a single text which contains a lot of forbidden characters. This text
   * must not be null
   * @return Returns the transformed text without specific codes.
   */
  public static String transformHtmlCode(String text) {
      final String traceDetail = " text recu " + text;
      SilverTrace.info("util", "Encode.transformHtmlCode()", "root.MSG_GEN_PARAM_VALUE", traceDetail);

      final String decoded = StringEscapeUtils.unescapeHtml4(text);
      return decoded;
  }

From source file:mobisocial.musubi.util.OGUtil.java

/**
 * Downloads {@code url} and collects Open Graph style metadata about it.
 * <p>
 * An image response is turned directly into a PNG thumbnail. For an (X)HTML response the
 * page title, the {@code og:*} / {@code description} meta tags and, when no
 * {@code og:image} could be loaded, the largest {@code <img>} of the page are used.
 * Any other content type yields a result that only has its MIME type set.
 *
 * @param url address of the shared page or image
 * @return the collected data, or {@code null} when the page cannot be fetched, has no
 *         content type, or an image stream fails while being opened/decoded
 */
public static OGData getOrGuess(String url) {
    DefaultHttpClient hc = new DefaultHttpClient();
    HttpResponse res;
    try {
        HttpGet hg = new HttpGet(url);
        res = hc.execute(hg);
    } catch (Exception e) {
        Log.e(TAG, "unable to fetch page to get og tags", e);
        return null;
    }
    String location = url;
    //TODO: if some kind of redirect magic happened, then
    //make the location match that

    OGData og = new OGData();
    HttpEntity he = res.getEntity();
    // A response without a body has no entity; treat that like a missing content type.
    Header content_type = he == null ? null : he.getContentType();
    //TODO: check the content directly if they forget the type header
    if (content_type == null || content_type.getValue() == null) {
        Log.e(TAG, "page missing content type ..abandoning: " + url);
        return null;
    }
    og.mMimeType = content_type.getValue();
    //just make a thumbnail if the shared item is an image
    if (og.mMimeType.startsWith("image/")) {
        Bitmap b;
        try {
            b = BitmapFactory.decodeStream(he.getContent());
        } catch (Exception e) {
            return null;
        }
        if (b == null) {
            // decodeStream signals undecodable data by returning null, not by throwing
            return null;
        }
        // The directly shared image is always scaled so that its long edge is 200px.
        og.mImage = encodeThumbnailPng(b, true);
        return og;
    }
    //if its not html, we can't extract more details, the caller
    //should rely on what they already know.
    if (!og.mMimeType.startsWith("text/html") && !og.mMimeType.startsWith("application/xhtml")) {
        Log.e(TAG, "shared content is not a known type for meta data processing " + og.mMimeType);
        return og;
    }

    String html;
    try {
        html = IOUtils.toString(he.getContent());
    } catch (Exception e) {
        Log.e(TAG, "failed to read html content", e);
        return og;
    }

    Matcher m = sTitleRegex.matcher(html);
    if (m.find()) {
        og.mTitle = StringEscapeUtils.unescapeHtml4(m.group(1));
    }

    m = sMetaRegex.matcher(html);
    int offset = 0;
    String raw_description = null;
    while (m.find(offset)) {
        // The finally block advances the search offset on every path, including "continue".
        try {
            String meta_tag = m.group();
            Matcher mp = sPropertyOfMeta.matcher(meta_tag);
            if (!mp.find())
                continue;
            String type = mp.group(1);
            //remove quotes
            type = type.substring(1, type.length() - 1);
            Matcher md = sContentOfMeta.matcher(meta_tag);
            if (!md.find())
                continue;
            String data = md.group(1);
            //remove quotes
            data = data.substring(1, data.length() - 1);
            data = StringEscapeUtils.unescapeHtml4(data);
            if (type.equalsIgnoreCase("og:title")) {
                og.mTitle = data;
            } else if (type.equalsIgnoreCase("og:image")) {
                HttpResponse resi;
                try {
                    HttpGet hgi = new HttpGet(data);
                    resi = hc.execute(hgi);
                } catch (Exception e) {
                    Log.e(TAG, "unable to fetch og image url", e);
                    continue;
                }
                HttpEntity hei = resi.getEntity();
                if (hei == null) {
                    Log.e(TAG, "og image response has no body: " + data);
                    continue;
                }
                // The content type is only used for diagnostics, so a missing one is tolerated.
                Header content_type_og = hei.getContentType();
                if (content_type_og != null && content_type_og.getValue() != null
                        && !content_type_og.getValue().startsWith("image/")) {
                    Log.e(TAG, "image og tag points to non image data" + content_type_og.getValue());
                }
                try {
                    Bitmap b;
                    try {
                        b = BitmapFactory.decodeStream(hei.getContent());
                    } catch (Exception e) {
                        // NOTE(review): this abandons the whole result, not just this image
                        return null;
                    }
                    if (b == null) {
                        Log.e(TAG, "failed to fetch image for og");
                        continue;
                    }
                    og.mImage = encodeThumbnailPng(b, false);
                } catch (Exception e) {
                    Log.e(TAG, "failed to fetch image for og", e);
                    continue;
                }
            } else if (type.equalsIgnoreCase("description")) {
                raw_description = data;
            } else if (type.equalsIgnoreCase("og:description")) {
                og.mDescription = data;
            } else if (type.equalsIgnoreCase("og:url")) {
                og.mUrl = data;
            }
        } finally {
            offset = m.end();
        }
    }

    HashSet<String> already_fetched = new HashSet<String>();
    if (og.mImage == null) {
        // No usable og:image: fall back to the largest <img> referenced by the page.
        // max_area tracks the ORIGINAL (unscaled) area of the best image so far.
        int max_area = 0;
        m = sImageRegex.matcher(html);
        int img_offset = 0;
        while (m.find(img_offset)) {
            try {
                String img_tag = m.group();
                Matcher ms = sSrcOfImage.matcher(img_tag);
                if (!ms.find())
                    continue;
                String img_src = ms.group(1);
                //remove quotes
                img_src = img_src.substring(1, img_src.length() - 1);
                img_src = StringEscapeUtils.unescapeHtml4(img_src);
                //don't fetch an image twice (like little 1x1 images)
                if (already_fetched.contains(img_src))
                    continue;
                already_fetched.add(img_src);
                HttpResponse resi;
                try {
                    HttpGet hgi = new HttpGet(new URL(new URL(location), img_src).toString());
                    resi = hc.execute(hgi);
                } catch (Exception e) {
                    Log.e(TAG, "unable to fetch image url for biggest image search" + img_src, e);
                    continue;
                }
                HttpEntity hei = resi.getEntity();
                if (hei == null) {
                    Log.w(TAG, "image missing en ..trying entity response: " + url);
                    continue;
                }
                Header content_type_image = hei.getContentType();
                if (content_type_image == null || content_type_image.getValue() == null) {
                    // "trying anyway": the type must not be dereferenced below
                    Log.w(TAG, "image missing content type ..trying anyway: " + url);
                } else if (!content_type_image.getValue().startsWith("image/")) {
                    Log.w(TAG, "image tag points to non image data " + content_type_image.getValue() + " "
                            + img_src);
                }
                try {
                    Bitmap b;
                    try {
                        b = BitmapFactory.decodeStream(hei.getContent());
                    } catch (Exception e) {
                        // NOTE(review): this abandons the whole result, not just this image
                        return null;
                    }
                    if (b == null) {
                        Log.e(TAG, "failed to fetch image for og");
                        continue;
                    }
                    int w = b.getWidth();
                    int h = b.getHeight();
                    int area = w * h;
                    if (area <= max_area) {
                        b.recycle();
                        continue;
                    }
                    if (w < 32 || h < 32) {
                        //skip dinky crap
                        b.recycle();
                        continue;
                    }
                    og.mImage = encodeThumbnailPng(b, false);
                    // Remember the unscaled area so later candidates are compared like for like.
                    max_area = area;
                } catch (Exception e) {
                    Log.e(TAG, "failed to fetch image for og", e);
                    continue;
                }
            } finally {
                img_offset = m.end();
            }
        }

    }
    if (og.mDescription == null)
        og.mDescription = raw_description;
    return og;
}

/**
 * Scales {@code source} so that its longer edge is at most 200px (exactly 200px when
 * {@code allowUpscale} is set), keeping the aspect ratio, and encodes the result as PNG.
 * The source bitmap and the scaled bitmap are both recycled before returning.
 *
 * @param source decoded bitmap; must not be null and is unusable afterwards
 * @param allowUpscale whether images smaller than 200px are enlarged to 200px
 * @return the PNG bytes of the thumbnail
 */
private static byte[] encodeThumbnailPng(Bitmap source, boolean allowUpscale) {
    int w = source.getWidth();
    int h = source.getHeight();
    final int longEdge = allowUpscale ? 200 : Math.min(200, Math.max(w, h));
    // Math.max(1, ..) keeps very elongated images from ending up with a zero dimension.
    if (w > h) {
        h = Math.max(1, h * longEdge / w);
        w = longEdge;
    } else {
        w = Math.max(1, w * longEdge / h);
        h = longEdge;
    }

    Bitmap scaled = Bitmap.createScaledBitmap(source, w, h, true);
    // createScaledBitmap hands back the source itself when the size is unchanged;
    // recycling it at this point would make the compress call below fail.
    if (scaled != source) {
        source.recycle();
    }
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    scaled.compress(CompressFormat.PNG, 100, baos);
    scaled.recycle();
    return baos.toByteArray();
}

From source file:com.wiseowl.WiseOwl.wikiClean.WikiClean.java

/**
 * Runs the given wiki markup through the full chain of removal steps, decodes HTML
 * entities, strips HTML tags and returns the remaining text with newlines replaced by spaces.
 *
 * @param content the wiki markup to clean
 * @return the cleaned, trimmed text on a single line
 */
public String clean(String content) {
    String text = content;

    // Markup removal; the steps run in this fixed order.
    text = removeRefs(text);
    text = removeInterWikiLinks(text);
    text = removeParentheticals(text);
    text = fixUnitConversion(text);
    text = ImageCaptionsRemover.remove(text);
    text = DoubleBracesRemover.remove(text);
    text = removeHtmlComments(text);
    text = removeEmphasis(text);
    text = removeHeadings(text);
    text = removeCategoryLinks(text);
    text = removeLinks(text);
    text = removeMath(text);
    text = removeGallery(text);
    text = removeNoToc(text);
    text = removeIndentation(text);

    text = TableRemover.remove(text);

    // Some HTML entities arrive doubly encoded, hence two decoding passes.
    final String decodedOnce = StringEscapeUtils.unescapeHtml4(text);
    text = StringEscapeUtils.unescapeHtml4(decodedOnce);
    text = removeHtmlTags(text);

    // Fold multiple newlines, then put everything on one line.
    text = compressMultipleNewlines(text);
    final String trimmed = text.trim();
    return trimmed.replaceAll("\\n+", " ");
}

From source file:com.datumbox.framework.core.utilities.text.parsers.HTMLParser.java

/**
 * Removes all tags from the given HTML, decodes its HTML entities and finally
 * removes extra spaces from the result.
 *
 * @param html the HTML text to clear
 * @return the text after tag removal, entity decoding and space clean-up
 */
private static String clear(String html) {
    final String withoutTags = unsafeRemoveAllTags(html);
    final String decoded = StringEscapeUtils.unescapeHtml4(withoutTags);
    return StringCleaner.removeExtraSpaces(decoded);
}

From source file:com.nttec.everychan.chans.krautchan.KrautBoardsListReader.java

/**
 * Processes the part of the input that follows a matched filter.
 * <p>
 * For {@code FILTER_CATEGORY} the text up to {@code H2_CLOSE} becomes the current category,
 * unless it contains a {@code <span} tag. For {@code FILTER_BOARD} the text up to
 * {@code LI_CLOSE} is stripped of tags and, if it matches {@code BOARD_PATTERN}, added to
 * the board list under the current category. Any other filter value is ignored.
 *
 * @param filter identifier of the filter that matched
 * @throws IOException if reading the underlying input fails
 */
private void handleFilter(int filter) throws IOException {
    if (filter == FILTER_CATEGORY) {
        skipUntilSequence(SPAN_CLOSE);
        final String category = readUntilSequence(H2_CLOSE);
        final boolean hasNestedSpan = category.contains("<span");
        if (!hasNestedSpan) {
            currentCategory = StringEscapeUtils.unescapeHtml4(category);
        }
        return;
    }

    if (filter != FILTER_BOARD) {
        return;
    }

    skipUntilSequence(CLOSE);
    final String rawItem = readUntilSequence(LI_CLOSE);
    final String board = RegexUtils.removeHtmlTags(rawItem).trim();
    final Matcher boardMatcher = BOARD_PATTERN.matcher(board);
    if (!boardMatcher.matches()) {
        return;
    }

    final SimpleBoardModel entry = new SimpleBoardModel();
    entry.chan = KrautModule.CHAN_NAME;
    entry.boardName = boardMatcher.group(1);
    entry.boardDescription = boardMatcher.group(2);
    entry.boardCategory = currentCategory;
    // Every board that is not listed in SFW_BOARDS is flagged as NSFW.
    entry.nsfw = SFW_BOARDS.indexOf(entry.boardName) == -1;
    boards.add(entry);
}

From source file:com.ryan.ryanreader.reddit.prepared.RedditPreparedComment.java

/**
 * Prepares a comment for display: resolves the header colours from the current theme,
 * decodes and parses the comment body and flair, links the comment to its parent, derives
 * the vote direction and registers the comment with the change data manager.
 *
 * @param context context used to resolve theme attributes and to reach the change data manager
 * @param comment the raw comment this object wraps
 * @param parentComment the parent comment, or {@code null} for a top-level comment
 * @param timestamp value stored as the time of the last change
 * @param needsUpdating whether to register the comment even when it carries no vote
 * @param parentPost the post the comment belongs to
 * @param user the account passed on to the change data manager
 * @param headerItems the header items to show for this comment
 */
public RedditPreparedComment(final Context context, final RedditComment comment,
        final RedditPreparedComment parentComment, final long timestamp, final boolean needsUpdating,
        final RedditPreparedPost parentPost, final RedditAccount user,
        final EnumSet<PrefsUtility.AppearanceCommentHeaderItems> headerItems) {

    this.src = comment;
    this.parentPost = parentPost;
    this.headerItems = headerItems;

    // TODO custom time

    // TODO don't fetch these every time
    final TypedArray appearance = context
            .obtainStyledAttributes(new int[] { R.attr.rrCommentHeaderBoldCol, R.attr.rrCommentHeaderAuthorCol,
                    R.attr.rrPostSubtitleUpvoteCol, R.attr.rrPostSubtitleDownvoteCol });
    try {
        // The indices follow the order of the attribute array above; 255 is the fallback colour.
        rrCommentHeaderBoldCol = appearance.getColor(0, 255);
        rrCommentHeaderAuthorCol = appearance.getColor(1, 255);
        rrPostSubtitleUpvoteCol = appearance.getColor(2, 255);
        rrPostSubtitleDownvoteCol = appearance.getColor(3, 255);
    } finally {
        // A TypedArray has to be handed back once its values have been read.
        appearance.recycle();
    }

    body = RedditCommentTextParser.parse(StringEscapeUtils.unescapeHtml4(comment.body));
    if (comment.author_flair_text != null) {
        flair = StringEscapeUtils.unescapeHtml4(comment.author_flair_text);
    } else {
        flair = null;
    }

    if (parentComment == null) {
        indentation = 0;
    } else {
        indentation = parentComment.indentation + 1;
        parentComment.addChild(this);
    }

    idAlone = comment.id;
    idAndType = comment.name;

    // likes: null = no vote, TRUE = upvote, anything else = downvote
    if (comment.likes == null) {
        voteDirection = 0;
    } else {
        voteDirection = Boolean.TRUE.equals(comment.likes) ? 1 : -1;
    }

    lastChange = timestamp;
    if (src.likes != null) {
        RedditChangeDataManager.getInstance(context).update(src.link_id, user, this, true);
    } else if (needsUpdating) {
        RedditChangeDataManager.getInstance(context).update(src.link_id, user, this, false);
    }

    rebuildHeader(context);
}

From source file:com.aistor.modules.cms.service.ArticleService.java

/**
 * Saves the given article. HTML entities in its content are decoded first, the status is
 * set to {@code Article.STATUS_AUDIT} when the current subject lacks the
 * {@code cms:article:audit} permission, the current user is attached to articles without
 * an id, and the update date is set to now.
 *
 * @param article the article to save
 */
@Transactional(readOnly = false)
public void save(Article article) {
    final String rawContent = article.getArticleData().getContent();
    if (rawContent != null) {
        article.getArticleData().setContent(StringEscapeUtils.unescapeHtml4(rawContent));
    }

    // Without the audit permission the article is put into the audit status.
    final boolean mayAudit = SecurityUtils.getSubject().isPermitted("cms:article:audit");
    if (!mayAudit) {
        article.setStatus(Article.STATUS_AUDIT);
    }

    // An article without an id gets the current user attached.
    final boolean hasNoId = article.getId() == null;
    if (hasNoId) {
        article.setUser(UserUtils.getUser());
    }

    article.setUpdateDate(new Date());
    articleDao.clear();
    articleDao.save(article);
}

From source file:com.thinkgem.jeesite.modules.issue.service.IssueService.java

/**
 * Saves the given issue after decoding the HTML entities in its content, if it has any.
 *
 * @param issue the issue to save
 */
@Transactional(readOnly = false)
public void save(IssueInfo issue) {
    final String rawContent = issue.getContent();
    if (rawContent != null) {
        final String decoded = StringEscapeUtils.unescapeHtml4(rawContent);
        issue.setContent(decoded);
    }
    issueDao.save(issue);
}

From source file:com.microsoft.windowsazure.services.table.client.AtomPubParser.java

/**
 * Reserved for internal use. Parses the operation response as an entity. Parses the result returned in the
 * specified stream in AtomPub format into a {@link TableResult} containing an entity of the specified class type
 * projected using the specified resolver.
 * <p>
 * The reader must be positioned on the start of an entry element when this method is called. When both
 * <code>clazzType</code> and <code>resolver</code> are <code>null</code>, parsing stops as soon as the properties
 * element is reached and the result is returned without properties being read.
 *
 * @param xmlr
 *            An <code>XMLStreamReader</code> on the input stream.
 * @param clazzType
 *            The class type <code>T</code> implementing {@link TableEntity} for the entity returned. Set to
 *            <code>null</code> to ignore the returned entity and copy only response properties into the
 *            {@link TableResult} object.
 * @param resolver
 *            An {@link EntityResolver} instance to project the entity into an instance of type <code>R</code>. Set
 *            to <code>null</code> to return the entity as an instance of the class type <code>T</code>.
 * @param opContext
 *            An {@link OperationContext} object used to track the execution of the operation.
 * @return
 *         A {@link TableResult} containing the parsed entity result of the operation.
 *
 * @throws XMLStreamException
 *             if an error occurs while accessing the stream.
 * @throws ParseException
 *             if an error occurs while parsing the stream.
 * @throws InstantiationException
 *             if an error occurs while constructing the result.
 * @throws IllegalAccessException
 *             if an error occurs in reflection while parsing the result.
 * @throws StorageException
 *             if a storage service error occurs.
 */
protected static <T extends TableEntity, R> TableResult parseEntity(final XMLStreamReader xmlr,
        final Class<T> clazzType, final EntityResolver<R> resolver, final OperationContext opContext)
        throws XMLStreamException, ParseException, InstantiationException, IllegalAccessException,
        StorageException {
    int eventType = xmlr.getEventType();
    final TableResult res = new TableResult();

    // The reader has to be on the opening entry element.
    xmlr.require(XMLStreamConstants.START_ELEMENT, null, ODataConstants.ENTRY);

    // The ETag is read from an attribute of the entry element; HTML entities in it are decoded.
    res.setEtag(StringEscapeUtils.unescapeHtml4(
            xmlr.getAttributeValue(ODataConstants.DATA_SERVICES_METADATA_NS, ODataConstants.ETAG)));

    // Walk the entry until the properties element has been handled.
    while (xmlr.hasNext()) {
        eventType = xmlr.next();
        if (eventType == XMLStreamConstants.CHARACTERS) {
            // Character data is skipped; the returned text is not used.
            xmlr.getText();
            continue;
        }

        final String name = xmlr.getName().toString();

        if (eventType == XMLStreamConstants.START_ELEMENT) {
            if (name.equals(ODataConstants.BRACKETED_ATOM_NS + ODataConstants.ID)) {
                res.setId(Utility.readElementFromXMLReader(xmlr, ODataConstants.ID));
            } else if (name
                    .equals(ODataConstants.BRACKETED_DATA_SERVICES_METADATA_NS + ODataConstants.PROPERTIES)) {
                // Do read properties
                if (resolver == null && clazzType == null) {
                    // Nobody wants the entity: return what has been collected so far.
                    return res;
                } else {
                    res.setProperties(readProperties(xmlr, opContext));
                    break;
                }
            }
        }
    }

    // Move to end Content
    // (an optional run of character data may precede each closing element)
    eventType = xmlr.next();
    if (eventType == XMLStreamConstants.CHARACTERS) {
        eventType = xmlr.next();
    }
    xmlr.require(XMLStreamConstants.END_ELEMENT, null, ODataConstants.CONTENT);

    // Move to the end of the entry
    eventType = xmlr.next();
    if (eventType == XMLStreamConstants.CHARACTERS) {
        eventType = xmlr.next();
    }

    xmlr.require(XMLStreamConstants.END_ELEMENT, null, ODataConstants.ENTRY);

    String rowKey = null;
    String partitionKey = null;
    Date timestamp = null;

    // Remove core properties from map and set individually
    // NOTE(review): this relies on the properties having been set in the loop above - confirm
    // what getProperties() returns when the properties element was never reached.
    EntityProperty tempProp = res.getProperties().get(TableConstants.PARTITION_KEY);
    if (tempProp != null) {
        res.getProperties().remove(TableConstants.PARTITION_KEY);
        partitionKey = tempProp.getValueAsString();
    }

    tempProp = res.getProperties().get(TableConstants.ROW_KEY);
    if (tempProp != null) {
        res.getProperties().remove(TableConstants.ROW_KEY);
        rowKey = tempProp.getValueAsString();
    }

    tempProp = res.getProperties().get(TableConstants.TIMESTAMP);
    if (tempProp != null) {
        res.getProperties().remove(TableConstants.TIMESTAMP);
        timestamp = tempProp.getValueAsDate();
    }

    // A resolver takes precedence over the class type when both are given.
    if (resolver != null) {
        // Call resolver
        res.setResult(resolver.resolve(partitionKey, rowKey, timestamp, res.getProperties(), res.getEtag()));
    } else if (clazzType != null) {
        // Generate new entity and return
        final T entity = clazzType.newInstance();
        entity.setEtag(res.getEtag());

        entity.setPartitionKey(partitionKey);
        entity.setRowKey(rowKey);
        entity.setTimestamp(timestamp);

        // The remaining (non-core) properties are handed to the entity itself.
        entity.readEntity(res.getProperties(), opContext);

        res.setResult(entity);
    }

    return res;
}