Example usage for java.util.regex Pattern UNICODE_CASE

List of usage examples for java.util.regex Pattern UNICODE_CASE

Introduction

On this page you can find example usages of java.util.regex Pattern UNICODE_CASE.

Prototype

int UNICODE_CASE

To view the source code for java.util.regex Pattern UNICODE_CASE, use the link below.

Click Source Link

Document

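Enables Unicode-aware case folding. The flag only affects matching when case-insensitive matching is also enabled via CASE_INSENSITIVE.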

Usage

From source file:com.yahoo.flowetl.core.util.RegexUtil.java

/**
 * Gets the pattern for the given string by providing the rules to do
 * extraction./*  w  ww  .j  av a  2  s  . c  om*/
 * 
 * This is similar to how php does regex to match you provide in the format
 * /REGEX/options where options currently are "i" for case insensitive and
 * "u" for unicode and "m" for multiline and "s" for dotall and the value
 * inside the // is the regex to use
 * 
 * @param str
 *            the string to parsed the pattern out of
 * 
 * @param cache
 *            whether to cache the compiled pattern
 * 
 * @return the pattern
 * 
 * @throws PatternSyntaxException
 * 
 *             the pattern syntax exception if it has wrong syntax
 */
public static Pattern getPattern(String str, boolean cache) throws PatternSyntaxException {
    if (str == null) {
        return null;
    }
    // see if we made it before...
    Pattern p = compiledPats.get(str);
    if (p != null) {
        return p;
    }
    Matcher mat = patExtractor.matcher(str);
    if (mat.matches() == false) {
        throw new PatternSyntaxException("Invalid syntax provided", str, -1);
    }
    String regex = mat.group(1);
    String opts = mat.group(2);
    int optsVal = 0;
    if (StringUtils.contains(opts, "i")) {
        optsVal |= Pattern.CASE_INSENSITIVE;
    }
    if (StringUtils.contains(opts, "u")) {
        optsVal |= Pattern.UNICODE_CASE;
    }
    if (StringUtils.contains(opts, "m")) {
        optsVal |= Pattern.MULTILINE;
    }
    if (StringUtils.contains(opts, "s")) {
        optsVal |= Pattern.DOTALL;
    }
    // compile and store it
    p = Pattern.compile(regex, optsVal);
    if (cache) {
        compiledPats.put(str, p);
    }
    return p;
}

From source file:org.lanes.utility.string.TextNormaliser.java

/**
 * Converts a lightly marked-up HTML fragment into a list of plain-text lines.
 *
 * The input is pushed through an ordered series of regex replacements:
 * entities are decoded, emphasis tags become {@code [text]}, list items and
 * bullet lines become {@code {text}}, structural tags are dropped, e-mail
 * addresses and URL-like tokens are replaced by placeholders, and combining
 * diacritical marks are removed. The result is split on newlines; each line
 * is trimmed and empty lines are discarded.
 *
 * @param html the markup to clean
 * @return the non-empty, trimmed lines of the cleaned text
 */
public static List<String> cleanLightHTML(String html) {

    // {regex, replacement} pairs, applied strictly in this order.
    final String[][] markupRules = {
        { "&nbsp;", " " },
        { "[\\{\\}\\[\\]]", "" },
        { "&amp;", "&" },
        { "(?i)<div.*?>(.*?)<\\/div>", "$1\n" },
        { "(?i)<strong.*?>(.*?)<\\/strong>", "[$1] " },
        { "(?i)<br\\/?>", "\n" }, // must run before the <b> rule
        { "(?i)<b.*?>(.*?)<\\/b>", "[$1] " },
        { "(?i)<em>(.*?)<\\/em>", "[$1] " },
        { "(?i)<i>(.*?)<\\/i>", "[$1] " },
        { "(?i)<u>(.*?)<\\/u>", "[$1] " },
        { "[\\s\\n]+\\]", "]" },
        { "\\[[\\s\\n]+", "[" },
        { "[\\s]*:\\]", "]" },
        { "(?i)<[\\/]?[uo]l.*?>", "" },
        { "(?i)<li.*?>(.+?)(?=<li>)", "{$1}\n" },
        { "(?i)<li.*?>(.+?)\\n", "{$1}\n" },
        { "(?i)<\\/li>", " " },
        { "(?i)<[\\/]?div.*?>", " " },
        { "(?i)<\\/?center>", " " },
        { "(?i)<\\/?p.*?>", " " },
        { "(?i)<\\/?li>", " " },
        { "(?i)<\\/?font.*?>", " " },
        { "(?i)<\\/?hr.*?>", " " },
        { "\\[\\]", "" },
    };

    // Rules that run after bullet lines have been wrapped in braces.
    final String[][] tokenRules = {
        { "\\s\\}", "}" },
        { "(?i)(?:[\\w\\.]+)@(?:[\\w]+\\.)+(?:[\\w]+)", "<EMAIL>" },
        { "(?i)(?:http:\\/\\/)?(?:[\\w]+\\.)+(?:[\\w]+)", "<URL>" },
        { "\\s*\\/\\s*", ", " },
    };

    String text = html;
    for (String[] rule : markupRules) {
        text = text.replaceAll(rule[0], rule[1]);
    }

    // Lines introduced by a middle dot or bullet character become {item} lines.
    final int bulletFlags = Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE;
    final Pattern bulletLine = Pattern.compile("[\u00B7\u2022]\\s*(.+?)\n", bulletFlags);
    text = bulletLine.matcher(text).replaceAll("{$1}\n");

    for (String[] rule : tokenRules) {
        text = text.replaceAll(rule[0], rule[1]);
    }

    // Decompose accented characters, then drop the combining marks.
    final String decomposed = Normalizer.normalize(text, Form.NFD);
    text = decomposed.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

    final List<String> cleanedLines = new ArrayList<String>();
    for (String rawLine : text.split("\\n")) {
        final String trimmed = rawLine.trim();
        if (trimmed.equals("")) {
            continue;
        }
        cleanedLines.add(trimmed);
    }
    return cleanedLines;
}

From source file:org.kurento.room.demo.FixedNKmsManager.java

/**
 * Sets the authentication regex and the pattern compiled from it.
 *
 * The regex is trimmed before use. A non-empty regex is compiled with
 * case-insensitive, Unicode-aware matching. A null or empty regex clears the
 * compiled pattern, so a pattern from an earlier call never outlives the
 * regex it was built from.
 *
 * Compilation happens before either field is assigned, so an invalid regex
 * leaves both fields untouched.
 *
 * @param regex the new regex, may be null
 * @throws java.util.regex.PatternSyntaxException if the regex is invalid
 */
public synchronized void setAuthRegex(String regex) {
    final String trimmed = regex != null ? regex.trim() : null;
    Pattern compiled = null;
    if (trimmed != null && !trimmed.isEmpty()) {
        compiled = Pattern.compile(trimmed, Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE);
    }
    // Assign both fields together so they always describe the same regex.
    this.authRegex = trimmed;
    this.authPattern = compiled;
}

From source file:com.romeikat.datamessie.core.processing.service.fulltext.query.QueryUtil.java

/**
 * Parses a query string into either an OUTOF query or a plain Lucene query.
 *
 * A string of the form "n outof abc def ghi" (case-insensitive) yields an
 * {@code OutOfQuery} built from the number and the parsed terms; any other
 * string is wrapped unchanged in a {@code LuceneQuery}.
 *
 * @param luceneQueryString the raw query text
 * @param analyzer the analyzer handed to the term parser
 * @return the detected query
 */
public FullTextQuery parseQuery(final String luceneQueryString, final Analyzer analyzer) {
    LOG.debug("Parsing query: {}", luceneQueryString);

    // Shape of an OUTOF query: "n outof abc def ghi"
    final int outOfFlags = Pattern.DOTALL | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
    final Matcher outOfMatcher = Pattern.compile("\\s*(\\d+)\\s+outof\\s+(.*)", outOfFlags)
            .matcher(luceneQueryString);

    if (!outOfMatcher.matches()) {
        // No match => Lucene query
        final LuceneQuery luceneQuery = new LuceneQuery(luceneQueryString);
        LOG.debug("Detected {}", luceneQuery);
        return luceneQuery;
    }

    // Match => OUTOF query
    final int n = Integer.parseInt(outOfMatcher.group(1));
    final String termsPart = outOfMatcher.group(2);
    final List<String> terms = parseUtil.parseTerms(termsPart, analyzer, true);
    final OutOfQuery outOfQuery = new OutOfQuery(n, terms);
    LOG.debug("Detected {}", outOfQuery);
    return outOfQuery;
}

From source file:GIST.IzbirkomExtractor.AbbrList.java

/**
 * Adds an abbreviation and a set of its expansions to abbreviation list.
 * /*from   ww  w  . j  av a  2s. c o  m*/
 * @param abbr_string
 * @param expansions
 */
protected void addAbbrev(String abbr_string, String[] expansions) {

    if (!abbrevs.containsKey(abbr_string)) {
        Abbreviation abbr = Abbreviation.createAbbreviation(abbr_string);
        Pattern pat = Pattern.compile("\\b" + abbr_string + "\\b",
                Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
        // FIXME: ignore case flag does not seem to work because abbreviations are retrieved by original case
        abbr.setPattern(pat);
        abbrevs.put(abbr_string, abbr);
    }
    abbrevs.get(abbr_string).addExpandions(expansions);

    expansionsPattern = abbreviationsPattern = null; /* reset the pattern to indicate modification  of the abbreviation list */
}

From source file:Normalization.TextNormalization.java

/**
 * Removes "@name" mentions and all '#' characters from the given text.
 *
 * A mention is an '@' followed by one or more {@code \w} characters. An
 * '@' that is not followed by such a character is left in place. Hash
 * signs are removed but the text after them is kept.
 *
 * @param content the text to clean, must not be null
 * @return the text without mentions and without '#' characters
 */
public String removeMentionsFromString(String content) {

    // Keep the UTF-8 round trip of the original code. The Charset overloads
    // are used so that no checked exception has to be caught; previously an
    // empty catch block would have silently turned a failure into "".
    // NOTE(review): per the String.getBytes documentation the round trip
    // replaces malformed input (e.g. unpaired surrogates); confirm whether
    // callers rely on it before dropping it.
    final java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;
    final String utf8tweet = new String(content.getBytes(utf8), utf8);

    final String regex = "[@]\\w+";
    final Pattern mentionPattern = Pattern.compile(regex,
            Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    final String withoutMentions = mentionPattern.matcher(utf8tweet).replaceAll("");
    return withoutMentions.replace("#", "");
}

From source file:tvbrowser.core.search.regexsearch.RegexSearcher.java

/**
 * Creates a pattern for a regular expression.
 *
 * @param regex The regular expression/*from   www  .  j  a v  a  2 s  .  c  o m*/
 * @param caseSensitive Should the search be case sensitive?
 * @return The pattern
 * @throws TvBrowserException If there is a syntax error in the regular expression.
 */
public static Pattern createSearchPattern(String regex, boolean caseSensitive) throws TvBrowserException {
    // Get the flags for the regex
    int flags = Pattern.DOTALL;
    if (!caseSensitive) {
        flags |= Pattern.CASE_INSENSITIVE;
        flags |= Pattern.UNICODE_CASE;
    }

    // Compile the regular expression
    Pattern pattern;
    try {
        pattern = Pattern.compile(regex, flags);
    } catch (PatternSyntaxException exc) {
        throw new TvBrowserException(RegexSearcher.class, "error.1",
                "Syntax error in the regular expression of the search pattern!", exc);
    }

    return pattern;
}

From source file:com.norconex.importer.handler.transformer.impl.StripAfterTransformer.java

/**
 * Truncates the content at the first match of the configured strip-after regex.
 *
 * When inclusive, the matched text is removed together with everything
 * after it; otherwise the match is kept and only the text after it is
 * removed. Content without a match is left unchanged.
 */
@Override
protected void transformStringContent(String reference, StringBuilder content, ImporterMetadata metadata,
        boolean parsed, boolean partialContent) {
    if (stripAfterRegex == null) {
        LOG.error("No regular expression provided.");
        return;
    }

    final int caseFlag = caseSensitive ? 0 : Pattern.CASE_INSENSITIVE;
    final Pattern stripPattern = Pattern.compile(stripAfterRegex,
            Pattern.DOTALL | Pattern.UNICODE_CASE | caseFlag);
    final Matcher hit = stripPattern.matcher(content);
    if (!hit.find()) {
        return;
    }

    // Cut from the start of the match when it is to be removed too, else from its end.
    final int cutFrom = inclusive ? hit.start() : hit.end();
    content.delete(cutFrom, content.length());
}

From source file:com.norconex.importer.handler.transformer.impl.StripBeforeTransformer.java

/**
 * Removes the beginning of the content up to the first match of the
 * configured strip-before regex.
 *
 * When inclusive, the matched text is removed together with everything
 * before it; otherwise the match is kept and only the text before it is
 * removed. Content without a match is left unchanged.
 */
@Override
protected void transformStringContent(String reference, StringBuilder content, ImporterMetadata metadata,
        boolean parsed, boolean partialContent) {
    if (stripBeforeRegex == null) {
        LOG.error("No regular expression provided.");
        return;
    }

    final int caseFlag = caseSensitive ? 0 : Pattern.CASE_INSENSITIVE;
    final Pattern stripPattern = Pattern.compile(stripBeforeRegex,
            Pattern.DOTALL | Pattern.UNICODE_CASE | caseFlag);
    final Matcher hit = stripPattern.matcher(content);
    if (!hit.find()) {
        return;
    }

    // Cut up to the end of the match when it is to be removed too, else up to its start.
    final int cutUntil = inclusive ? hit.end() : hit.start();
    content.delete(0, cutUntil);
}

From source file:nz.net.orcon.kanban.automation.actions.RegexAction.java

/**
 * Extracts one capture group from the n-th match of a regular expression.
 *
 * @param text the text to search; null is treated as the empty string
 * @param expressionString the regular expression, must not be null
 * @param match 1-based index of the match to use
 * @param group index of the capture group to return, 0 being the whole match
 * @param options names of {@link Pattern} flag constants separated by '|',
 *            e.g. "CASE_INSENSITIVE|DOTALL"; names are case-insensitive,
 *            surrounding whitespace is ignored, unknown names are skipped;
 *            may be null for no flags
 * @return the text of the requested group, or the empty string when there
 *         is no such match or the group index is out of range; as with
 *         {@link Matcher#group(int)}, the result may be null when the group
 *         exists but did not take part in the match
 * @throws IllegalArgumentException if expressionString is null
 * @throws IOException declared for callers; not thrown by this implementation
 */
public String extract(String text, String expressionString, int match, int group, String options)
        throws IOException {

    if (text == null) {
        text = "";
    }

    if (expressionString == null) {
        throw new IllegalArgumentException(
                "No Regular Expression has been provided to carry out this operation.");
    }

    int optionsInEffect = 0;
    if (options != null) {
        // Locale.ROOT keeps the upper-casing independent of the default locale
        // (a Turkish locale would otherwise turn 'i' into a dotted capital I).
        for (String option : options.toUpperCase(java.util.Locale.ROOT).split("\\|")) {
            optionsInEffect |= regexFlagFor(option.trim());
        }
    }

    Matcher matches = Pattern.compile(expressionString, optionsInEffect).matcher(text);

    int matchIndex = 1;
    while (matches.find()) {
        if (matchIndex == match) {
            // Only this match can satisfy the request; an out-of-range group yields "".
            if (group >= 0 && group <= matches.groupCount()) {
                return matches.group(group);
            }
            return "";
        }
        matchIndex++;
    }

    return "";
}

/** Maps the upper-case name of a Pattern flag constant to its value, or 0 for an unknown name. */
private static int regexFlagFor(String name) {
    switch (name) {
    case "CANON_EQ":
        return Pattern.CANON_EQ;
    case "CASE_INSENSITIVE":
        return Pattern.CASE_INSENSITIVE;
    case "COMMENTS":
        return Pattern.COMMENTS;
    case "DOTALL":
        return Pattern.DOTALL;
    case "LITERAL":
        return Pattern.LITERAL;
    case "MULTILINE":
        return Pattern.MULTILINE;
    case "UNICODE_CASE":
        return Pattern.UNICODE_CASE;
    case "UNIX_LINES":
        return Pattern.UNIX_LINES;
    default:
        return 0;
    }
}