// Pattern helper
/*
 * Static String formatting and query routines.
 * Copyright (C) 2001-2005 Stephen Ostermiller
 * http://ostermiller.org/contact.pl?regarding=Java+Utilities
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * See COPYING.TXT for details.
 */

import java.util.HashMap;
import java.util.regex.Pattern;

/**
 * Utilities for String formatting, manipulation, and queries.
 * More information about this class is available from <a target="_top" href=
 * "http://ostermiller.org/utils/StringHelper.html">ostermiller.org</a>.
 * <p>
 * Every pattern produced by this class is meant to be used with
 * <code>Matcher.matches()</code>, i.e. it always describes the whole input
 * string. "Contains", "starts with" and "ends with" are expressed by padding
 * the alternation of terms with <code>.*</code> in DOTALL mode.
 *
 * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
 * @since ostermillerutils 1.00.00
 */
public class StringHelper {

    /** Embedded flag: DOTALL, so that <code>.</code> also matches line terminators. */
    private static final String FLAGS_CASE_SENSITIVE = "(?s)";

    /** Embedded flags: case-insensitive, Unicode-aware case folding, DOTALL. */
    private static final String FLAGS_IGNORE_CASE = "(?i)(?u)(?s)";

    /** Matches any (possibly empty) run of characters. */
    private static final String ANYTHING = ".*";

    /** Anchors the match at the very beginning of the input. */
    private static final String INPUT_START = "\\A";

    /** Anchors the match at the very end of the input. */
    private static final String INPUT_END = "\\z";

    /**
     * Build a regular expression that is each of the terms or'd together.
     * Each term is escaped so that it is matched literally, and wrapped in
     * its own non-capturing group.
     *
     * @param terms a list of search terms.
     * @param sb place to build the regular expression.
     * @throws IllegalArgumentException if the length of terms is zero.
     * @throws NullPointerException if terms or any of its elements is null.
     *
     * @since ostermillerutils 1.02.25
     */
    private static void buildFindAnyPattern(String[] terms, StringBuffer sb){
        if (terms.length == 0) {
            throw new IllegalArgumentException("There must be at least one term to find.");
        }
        sb.append("(?:");
        for (int i = 0; i < terms.length; i++){
            if (i > 0) {
                sb.append("|");
            }
            sb.append("(?:");
            sb.append(escapeRegularExpressionLiteral(terms[i]));
            sb.append(")");
        }
        sb.append(")");
    }

    /**
     * Compile <code>prefix + (term1|term2|...) + suffix</code>.
     * Shared implementation behind all of the public pattern factories.
     *
     * @param prefix regular expression text placed before the alternation (flags, anchors, padding).
     * @param terms Array of search strings, matched literally.
     * @param suffix regular expression text placed after the alternation.
     * @return the compiled pattern.
     * @throws IllegalArgumentException if the length of terms is zero.
     */
    private static Pattern compileAnyPattern(String prefix, String[] terms, String suffix){
        StringBuffer sb = new StringBuffer();
        sb.append(prefix);
        buildFindAnyPattern(terms, sb);
        sb.append(suffix);
        return Pattern.compile(sb.toString());
    }

    /**
     * Compile a pattern that will match a string if the string
     * contains any of the given terms.
     * <p>
     * Usage:<br>
     * <code>boolean b = getContainsAnyPattern(terms).matcher(s).matches();</code>
     * <p>
     * If multiple strings are matched against the same set of terms,
     * it is more efficient to reuse the pattern returned by this function.
     *
     * @param terms Array of search strings.
     * @return Compiled pattern that can be used to match a string to see if it contains any of the terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static Pattern getContainsAnyPattern(String[] terms){
        return compileAnyPattern(FLAGS_CASE_SENSITIVE + ANYTHING, terms, ANYTHING);
    }

    /**
     * Compile a pattern that will match a string if the string
     * equals any of the given terms.
     * <p>
     * Usage:<br>
     * <code>boolean b = getEqualsAnyPattern(terms).matcher(s).matches();</code>
     * <p>
     * If multiple strings are matched against the same set of terms,
     * it is more efficient to reuse the pattern returned by this function.
     *
     * @param terms Array of search strings.
     * @return Compiled pattern that can be used to match a string to see if it equals any of the terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static Pattern getEqualsAnyPattern(String[] terms){
        return compileAnyPattern(FLAGS_CASE_SENSITIVE + INPUT_START, terms, INPUT_END);
    }

    /**
     * Compile a pattern that will match a string if the string
     * starts with any of the given terms.
     * <p>
     * Usage:<br>
     * <code>boolean b = getStartsWithAnyPattern(terms).matcher(s).matches();</code>
     * <p>
     * If multiple strings are matched against the same set of terms,
     * it is more efficient to reuse the pattern returned by this function.
     *
     * @param terms Array of search strings.
     * @return Compiled pattern that can be used to match a string to see if it starts with any of the terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static Pattern getStartsWithAnyPattern(String[] terms){
        return compileAnyPattern(FLAGS_CASE_SENSITIVE + INPUT_START, terms, ANYTHING);
    }

    /**
     * Compile a pattern that will match a string if the string
     * ends with any of the given terms.
     * <p>
     * Usage:<br>
     * <code>boolean b = getEndsWithAnyPattern(terms).matcher(s).matches();</code>
     * <p>
     * If multiple strings are matched against the same set of terms,
     * it is more efficient to reuse the pattern returned by this function.
     *
     * @param terms Array of search strings.
     * @return Compiled pattern that can be used to match a string to see if it ends with any of the terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static Pattern getEndsWithAnyPattern(String[] terms){
        return compileAnyPattern(FLAGS_CASE_SENSITIVE + ANYTHING, terms, INPUT_END);
    }

    /**
     * Compile a pattern that will match a string if the string
     * contains any of the given terms.
     * <p>
     * Case is ignored when matching using Unicode case rules.
     * <p>
     * Usage:<br>
     * <code>boolean b = getContainsAnyIgnoreCasePattern(terms).matcher(s).matches();</code>
     * <p>
     * If multiple strings are matched against the same set of terms,
     * it is more efficient to reuse the pattern returned by this function.
     *
     * @param terms Array of search strings.
     * @return Compiled pattern that can be used to match a string to see if it contains any of the terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static Pattern getContainsAnyIgnoreCasePattern(String[] terms){
        return compileAnyPattern(FLAGS_IGNORE_CASE + ANYTHING, terms, ANYTHING);
    }

    /**
     * Compile a pattern that will match a string if the string
     * equals any of the given terms.
     * <p>
     * Case is ignored when matching using Unicode case rules.
     * <p>
     * Usage:<br>
     * <code>boolean b = getEqualsAnyIgnoreCasePattern(terms).matcher(s).matches();</code>
     * <p>
     * If multiple strings are matched against the same set of terms,
     * it is more efficient to reuse the pattern returned by this function.
     *
     * @param terms Array of search strings.
     * @return Compiled pattern that can be used to match a string to see if it equals any of the terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static Pattern getEqualsAnyIgnoreCasePattern(String[] terms){
        return compileAnyPattern(FLAGS_IGNORE_CASE + INPUT_START, terms, INPUT_END);
    }

    /**
     * Compile a pattern that will match a string if the string
     * starts with any of the given terms.
     * <p>
     * Case is ignored when matching using Unicode case rules.
     * <p>
     * Usage:<br>
     * <code>boolean b = getStartsWithAnyIgnoreCasePattern(terms).matcher(s).matches();</code>
     * <p>
     * If multiple strings are matched against the same set of terms,
     * it is more efficient to reuse the pattern returned by this function.
     *
     * @param terms Array of search strings.
     * @return Compiled pattern that can be used to match a string to see if it starts with any of the terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static Pattern getStartsWithAnyIgnoreCasePattern(String[] terms){
        return compileAnyPattern(FLAGS_IGNORE_CASE + INPUT_START, terms, ANYTHING);
    }

    /**
     * Compile a pattern that will match a string if the string
     * ends with any of the given terms.
     * <p>
     * Case is ignored when matching using Unicode case rules.
     * <p>
     * Usage:<br>
     * <code>boolean b = getEndsWithAnyIgnoreCasePattern(terms).matcher(s).matches();</code>
     * <p>
     * If multiple strings are matched against the same set of terms,
     * it is more efficient to reuse the pattern returned by this function.
     *
     * @param terms Array of search strings.
     * @return Compiled pattern that can be used to match a string to see if it ends with any of the terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static Pattern getEndsWithAnyIgnoreCasePattern(String[] terms){
        return compileAnyPattern(FLAGS_IGNORE_CASE + ANYTHING, terms, INPUT_END);
    }

    /**
     * Tests to see if the given string contains any of the given terms.
     * <p>
     * This implementation compiles a single regular expression that can
     * test all the terms at once, and uses that expression against the
     * string, rather than testing the string against each term in turn.
     * <p>
     * This is a convenience method.  If multiple strings are tested against
     * the same set of terms, it is more efficient not to compile the regular
     * expression multiple times.
     * @see #getContainsAnyPattern(String[])
     *
     * @param s String that may contain any of the given terms.
     * @param terms list of substrings that may be contained in the given string.
     * @return true iff one of the terms is a substring of the given string.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static boolean containsAny(String s, String[] terms){
        return getContainsAnyPattern(terms).matcher(s).matches();
    }

    /**
     * Tests to see if the given string equals any of the given terms.
     * <p>
     * This implementation compiles a single regular expression that can
     * test all the terms at once, and uses that expression against the
     * string, rather than testing the string against each term in turn.
     * <p>
     * This is a convenience method.  If multiple strings are tested against
     * the same set of terms, it is more efficient not to compile the regular
     * expression multiple times.
     * @see #getEqualsAnyPattern(String[])
     *
     * @param s String that may equal any of the given terms.
     * @param terms list of strings that may equal the given string.
     * @return true iff one of the terms is equal to the given string.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static boolean equalsAny(String s, String[] terms){
        return getEqualsAnyPattern(terms).matcher(s).matches();
    }

    /**
     * Tests to see if the given string starts with any of the given terms.
     * <p>
     * This implementation compiles a single regular expression that can
     * test all the terms at once, and uses that expression against the
     * string, rather than testing the string against each term in turn.
     * <p>
     * This is a convenience method.  If multiple strings are tested against
     * the same set of terms, it is more efficient not to compile the regular
     * expression multiple times.
     * @see #getStartsWithAnyPattern(String[])
     *
     * @param s String that may start with any of the given terms.
     * @param terms list of prefixes that the given string may start with.
     * @return true iff the given string starts with one of the given terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static boolean startsWithAny(String s, String[] terms){
        return getStartsWithAnyPattern(terms).matcher(s).matches();
    }

    /**
     * Tests to see if the given string ends with any of the given terms.
     * <p>
     * This implementation compiles a single regular expression that can
     * test all the terms at once, and uses that expression against the
     * string, rather than testing the string against each term in turn.
     * <p>
     * This is a convenience method.  If multiple strings are tested against
     * the same set of terms, it is more efficient not to compile the regular
     * expression multiple times.
     * @see #getEndsWithAnyPattern(String[])
     *
     * @param s String that may end with any of the given terms.
     * @param terms list of suffixes that the given string may end with.
     * @return true iff the given string ends with one of the given terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static boolean endsWithAny(String s, String[] terms){
        return getEndsWithAnyPattern(terms).matcher(s).matches();
    }

    /**
     * Tests to see if the given string contains any of the given terms.
     * <p>
     * Case is ignored when matching using Unicode case rules.
     * <p>
     * This implementation compiles a single regular expression that can
     * test all the terms at once, and uses that expression against the
     * string, rather than testing the string against each term in turn.
     * <p>
     * This is a convenience method.  If multiple strings are tested against
     * the same set of terms, it is more efficient not to compile the regular
     * expression multiple times.
     * @see #getContainsAnyIgnoreCasePattern(String[])
     *
     * @param s String that may contain any of the given terms.
     * @param terms list of substrings that may be contained in the given string.
     * @return true iff one of the terms is a substring of the given string.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static boolean containsAnyIgnoreCase(String s, String[] terms){
        return getContainsAnyIgnoreCasePattern(terms).matcher(s).matches();
    }

    /**
     * Tests to see if the given string equals any of the given terms.
     * <p>
     * Case is ignored when matching using Unicode case rules.
     * <p>
     * This implementation compiles a single regular expression that can
     * test all the terms at once, and uses that expression against the
     * string, rather than testing the string against each term in turn.
     * <p>
     * This is a convenience method.  If multiple strings are tested against
     * the same set of terms, it is more efficient not to compile the regular
     * expression multiple times.
     * @see #getEqualsAnyIgnoreCasePattern(String[])
     *
     * @param s String that may equal any of the given terms.
     * @param terms list of strings that may equal the given string.
     * @return true iff one of the terms is equal to the given string.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static boolean equalsAnyIgnoreCase(String s, String[] terms){
        return getEqualsAnyIgnoreCasePattern(terms).matcher(s).matches();
    }

    /**
     * Tests to see if the given string starts with any of the given terms.
     * <p>
     * Case is ignored when matching using Unicode case rules.
     * <p>
     * This implementation compiles a single regular expression that can
     * test all the terms at once, and uses that expression against the
     * string, rather than testing the string against each term in turn.
     * <p>
     * This is a convenience method.  If multiple strings are tested against
     * the same set of terms, it is more efficient not to compile the regular
     * expression multiple times.
     * @see #getStartsWithAnyIgnoreCasePattern(String[])
     *
     * @param s String that may start with any of the given terms.
     * @param terms list of prefixes that the given string may start with.
     * @return true iff the given string starts with one of the given terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static boolean startsWithAnyIgnoreCase(String s, String[] terms){
        return getStartsWithAnyIgnoreCasePattern(terms).matcher(s).matches();
    }

    /**
     * Tests to see if the given string ends with any of the given terms.
     * <p>
     * Case is ignored when matching using Unicode case rules.
     * <p>
     * This implementation compiles a single regular expression that can
     * test all the terms at once, and uses that expression against the
     * string, rather than testing the string against each term in turn.
     * <p>
     * This is a convenience method.  If multiple strings are tested against
     * the same set of terms, it is more efficient not to compile the regular
     * expression multiple times.
     * @see #getEndsWithAnyIgnoreCasePattern(String[])
     *
     * @param s String that may end with any of the given terms.
     * @param terms list of suffixes that the given string may end with.
     * @return true iff the given string ends with one of the given terms.
     * @throws IllegalArgumentException if the length of terms is zero.
     *
     * @since ostermillerutils 1.02.25
     */
    public static boolean endsWithAnyIgnoreCase(String s, String[] terms){
        return getEndsWithAnyIgnoreCasePattern(terms).matcher(s).matches();
    }

    /**
     * Decide whether a single UTF-16 code unit must be preceded by a
     * backslash to be taken literally in a regular expression.
     *
     * @param c the code unit to examine.
     * @return false for ASCII letters, ASCII digits and surrogate code units; true otherwise.
     */
    private static boolean needsRegularExpressionEscape(char c){
        if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')){
            // A backslash before an ASCII letter or digit either is an error
            // or introduces an escaped construct, so these stay bare.
            return false;
        }
        if (c >= 0xD800 && c <= 0xDFFF){
            // Half of a surrogate pair (U+D800 through U+DFFF).  Putting a
            // backslash between the two halves makes the regex compiler see
            // two lone surrogates instead of one supplementary code point,
            // and such a pattern never matches the real character.
            // Surrogates have no special meaning to the regex syntax, so
            // they are safe to leave alone.
            return false;
        }
        return true;
    }

    /**
     * Escapes characters that have special meaning to
     * regular expressions, so that the returned text matches
     * the given string literally.
     * <p>
     * Every character other than ASCII letters, ASCII digits and the
     * halves of surrogate pairs is preceded by a backslash.
     *
     * @param s String to be escaped
     * @return escaped String; the very same instance if nothing needed escaping.
     * @throws NullPointerException if s is null.
     *
     * @since ostermillerutils 1.02.25
     */
    public static String escapeRegularExpressionLiteral(String s){
        // According to the documentation in the Pattern class:
        //
        // The backslash character ('\') serves to introduce escaped constructs,
        // as defined in the table above, as well as to quote characters that
        // otherwise would be interpreted as unescaped constructs.  Thus the
        // expression \\ matches a single backslash and \{ matches a left brace.
        //
        // It is an error to use a backslash prior to any alphabetic character
        // that does not denote an escaped construct; these are reserved for future
        // extensions to the regular-expression language.  A backslash may be used
        // prior to a non-alphabetic character regardless of whether that character
        // is part of an unescaped construct.
        //
        // As a result, escape everything except [0-9a-zA-Z] (and surrogates,
        // see needsRegularExpressionEscape).

        int length = s.length();

        // First pass: count the escapes so the common "nothing to do" case
        // allocates nothing and the buffer can be sized exactly otherwise.
        int escapeCount = 0;
        for (int i = 0; i < length; i++){
            if (needsRegularExpressionEscape(s.charAt(i))){
                escapeCount++;
            }
        }
        if (escapeCount == 0){
            // nothing to escape in the string
            return s;
        }

        // Second pass: copy, inserting a backslash where required.
        StringBuffer sb = new StringBuffer(length + escapeCount);
        for (int i = 0; i < length; i++){
            char c = s.charAt(i);
            if (needsRegularExpressionEscape(c)){
                sb.append('\\');
            }
            sb.append(c);
        }
        return sb.toString();
    }
}