Java Regex String Replace HTML removeTag(String text)

Here you can find the source of removeTag(String text)

Description

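Removes HTML tags from a string. Script, object and style elements are dropped, block-level end tags and line breaks are replaced by a period and a space, and runs of whitespace are collapsed.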

License

Open Source License

Declaration

public static final String removeTag(String text) 

Method Source Code

//package com.java2s;
/**
 *
 * Copyright (C) 2009-2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that strip HTML markup from a string using regular expressions.
 *
 * <p>All patterns are compiled once and shared. {@link Pattern} instances are immutable
 * and safe for concurrent use, and every {@link Matcher} used here is local to a single
 * call, so no locking is required.
 */
public class Main {
    /** Names of the tags whose end tag is treated as a sentence boundary. */
    private static final String BLOCK_TAG_NAMES =
            "p|td|div|h1|h2|h3|h4|h5|h6|hr|li|option|pre|select|table|tbody|textarea|tfoot|thead|th|title|tr|ul";

    /** Any tag: everything from {@code <} up to the next {@code >}. */
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]*>");

    /** A period, optional whitespace, then a {@code <br>} tag. */
    private static final Pattern PERIOD_THEN_BR = Pattern.compile("\\.\\p{Space}*<br\\p{Space}*/?>",
            Pattern.CASE_INSENSITIVE);

    /** A period, optional whitespace, then a block-level end tag. */
    private static final Pattern PERIOD_THEN_BLOCK_END = Pattern
            .compile("\\.\\p{Space}*</(" + BLOCK_TAG_NAMES + ")>", Pattern.CASE_INSENSITIVE);

    /** A block-level end tag. */
    private static final Pattern BLOCK_END = Pattern.compile("</(" + BLOCK_TAG_NAMES + ")>",
            Pattern.CASE_INSENSITIVE);

    /** A {@code <br>} tag, with or without the self-closing slash. */
    private static final Pattern BR = Pattern.compile("<br\\p{Space}*/?>", Pattern.CASE_INSENSITIVE);

    /**
     * A whole script, object or style element including its content. The body is matched
     * lazily up to the closing tag with the same name, so content that itself contains
     * {@code <} (for example a comparison in a script) is removed as well.
     */
    private static final Pattern SCRIPT_OBJECT_STYLE = Pattern.compile(
            "<(script|object|style)\\b[^>]*>.*?</\\1\\p{Space}*>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Removes all HTML tags from {@code text}.
     *
     * <p>Processing order:
     * <ol>
     * <li>runs of whitespace are collapsed to a single space;</li>
     * <li>script, object and style elements are removed together with their content;</li>
     * <li>a period directly before a {@code <br>} or a block-level end tag is folded into
     * the tag, so that the next step does not produce a doubled period;</li>
     * <li>block-level end tags and {@code <br>} tags become a period followed by a space;</li>
     * <li>every remaining tag is removed and whitespace is collapsed again.</li>
     * </ol>
     *
     * @param text the HTML text; must not be {@code null}
     * @return the text without tags
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static final String removeTag(String text) {
        String result = replaceConsecutiveSpaces(text, " ");
        result = SCRIPT_OBJECT_STYLE.matcher(result).replaceAll("");
        // Fold "sentence.<br>" and "sentence.</p>" into a bare "</p>" marker first; the
        // marker is then expanded to ". " exactly once by the BLOCK_END replacement below.
        result = PERIOD_THEN_BR.matcher(result).replaceAll("</p>");
        result = PERIOD_THEN_BLOCK_END.matcher(result).replaceAll("</p>");
        result = BLOCK_END.matcher(result).replaceAll(". ");
        result = BR.matcher(result).replaceAll(". ");
        result = ANY_TAG.matcher(result).replaceAll("");
        return replaceConsecutiveSpaces(result, " ");
    }

    /**
     * Removes every tag from {@code text} except those listed in {@code allowedTags}.
     *
     * <p>An entry of {@code allowedTags} must equal the complete tag text as it appears in
     * the input, including the angle brackets (for example {@code "<b>"} and
     * {@code "</b>"}); the comparison is exact and case-sensitive.
     *
     * <p>Whitespace is collapsed only when {@code allowedTags} is {@code null}; when a
     * list is supplied the original whitespace is left untouched.
     *
     * @param text        the HTML text; must not be {@code null}
     * @param allowedTags the tags to keep, or {@code null} to remove every tag
     * @return the text with all non-allowed tags removed
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static final String removeTag(String text, String[] allowedTags) {
        if (allowedTags == null) {
            text = replaceConsecutiveSpaces(text, " ");
        }
        StringBuffer sb = new StringBuffer(text.length());
        Matcher matcher = ANY_TAG.matcher(text);
        while (matcher.find()) {
            String tag = matcher.group();
            // The kept tag must be quoted: appendReplacement interprets '$' and '\' in the
            // replacement string, which would corrupt or reject tags containing them.
            String replacement = isAllowed(tag, allowedTags) ? Matcher.quoteReplacement(tag) : "";
            matcher.appendReplacement(sb, replacement);
        }
        matcher.appendTail(sb);
        return sb.toString();
    }

    /** Tells whether {@code tag} equals one of the entries of {@code allowedTags}. */
    private static boolean isAllowed(String tag, String[] allowedTags) {
        if (allowedTags == null) {
            return false;
        }
        for (String candidate : allowedTags) {
            // tag is never null here, so a null entry in the array is simply skipped.
            if (tag.equals(candidate)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces every run of whitespace characters in {@code source} with {@code replace}.
     *
     * <p>Whitespace is determined by {@link Character#isWhitespace(char)}.
     *
     * @param source  the text to process; must not be {@code null}
     * @param replace the text substituted for each run of whitespace, or {@code null} to
     *                drop the whitespace entirely
     * @return the processed text
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static final String replaceConsecutiveSpaces(String source, String replace) {
        int length = source.length();
        StringBuilder target = new StringBuilder(length);
        boolean inWhitespaceRun = false;
        for (int i = 0; i < length; i++) {
            char c = source.charAt(i);
            if (!Character.isWhitespace(c)) {
                target.append(c);
                inWhitespaceRun = false;
                continue;
            }
            // Only the first character of a whitespace run emits the replacement.
            if (!inWhitespaceRun) {
                if (replace != null) {
                    target.append(replace);
                }
                inWhitespaceRun = true;
            }
        }
        return target.toString();
    }
}

Related

  1. removeAllHtmlTag(String str)
  2. removeAllTags(String html)
  3. removeAllTags(String htmlText)
  4. removeAllTags(String input)
  5. removeTag(String tagname, String xmlstring)
  6. removeTags(String html)
  7. removeTags(String input, List knownTagList)
  8. removeTags(String string)
  9. replaceHtml(String html)