Example usage for java.util.regex Pattern UNICODE_CHARACTER_CLASS

Introduction

On this page you can find example usages of java.util.regex Pattern UNICODE_CHARACTER_CLASS.

Prototype

int UNICODE_CHARACTER_CLASS

To view the source code for java.util.regex Pattern UNICODE_CHARACTER_CLASS, click Source Link.

Document

Enables the Unicode version of Predefined character classes and POSIX character classes.

Usage

From source file:org.xbib.elasticsearch.index.analysis.langdetect.LangdetectService.java

/**
 * Start-up hook: calls {@code load(settings)}, resets the prior map and then
 * reads the tuning parameters from the settings, falling back to the
 * defaults given below whenever a key is missing.
 *
 * <p>The optional {@code pattern} setting is compiled with
 * {@link Pattern#UNICODE_CHARACTER_CLASS}; when the setting is absent the
 * filter pattern is left {@code null}.
 *
 * @throws ElasticsearchException declared by the overridden method
 */
@Override
protected void doStart() throws ElasticsearchException {
    load(settings);
    this.priorMap = null;

    // Numeric parameters, each with its default value.
    this.n_trial = settings.getAsInt("number_of_trials", 7);
    this.alpha = settings.getAsDouble("alpha", 0.5);
    this.alpha_width = settings.getAsDouble("alpha_width", 0.05);
    this.iteration_limit = settings.getAsInt("iteration_limit", 10000);
    this.prob_threshold = settings.getAsDouble("prob_threshold", 0.1);
    this.conv_threshold = settings.getAsDouble("conv_threshold", 0.99999);
    this.base_freq = settings.getAsInt("base_freq", 10000);

    // Optional filter expression; no pattern configured means no filter.
    final String configuredPattern = settings.get("pattern");
    if (configuredPattern == null) {
        this.filterPattern = null;
    } else {
        this.filterPattern = Pattern.compile(configuredPattern, Pattern.UNICODE_CHARACTER_CLASS);
    }
}

From source file:io.bibleget.HTTPCaller.java

/** A whole query that starts with a book indicator: optional leading 1-3, then letters. */
private static final Pattern QUERY_WITH_BOOK = Pattern.compile("^[1-3]{0,1}((\\p{L}\\p{M}*)+)(.*)");
/** A whole query that starts with a book indicator immediately followed by a chapter number. */
private static final Pattern QUERY_WITH_BOOK_AND_CHAPTER = Pattern
        .compile("^[1-3]{0,1}((\\p{L}\\p{M}*)+)[1-9][0-9]{0,2}(.*)");
/** The leading book indicator itself (used to capture its text). */
private static final Pattern BOOK_INDICATOR = Pattern.compile("^[1-3]{0,1}((\\p{L}\\p{M}*)+)",
        Pattern.UNICODE_CHARACTER_CLASS);
/** A whole query that starts with a chapter number. */
private static final Pattern QUERY_WITH_CHAPTER = Pattern.compile("^[1-9][0-9]{0,2}(.*)");
/** A number followed by a dot and preceded by a comma, dash or dot. */
private static final Pattern NUMBER_BEFORE_DOT = Pattern.compile("[,|\\-|\\.][1-9][0-9]{0,2}\\.");
/** Zero-width match in front of every {@code number.number} pair (groups 2 and 3 are the numbers). */
private static final Pattern DOT_PAIR = Pattern
        .compile("(?<![0-9])(?=(([1-9][0-9]{0,2})\\.([1-9][0-9]{0,2})))");
/** {@code chapter,verse}; group 1 is the chapter. */
private static final Pattern COMMA_PAIR = Pattern.compile("([1-9][0-9]{0,2})\\,[1-9][0-9]{0,2}");
/** {@code verse-verse} after a comma or dot; group 1 is the number after the dash. */
private static final Pattern VERSE_RANGE_END = Pattern.compile("[,\\.][1-9][0-9]{0,2}\\-([1-9][0-9]{0,2})");
/** A number directly after a comma; group 1 is that number. */
private static final Pattern VERSE_AFTER_COMMA = Pattern.compile(",([1-9][0-9]{0,2})");
/** A number after a dot at the very end of the query; group 1 is that number. */
private static final Pattern TRAILING_DOT_VERSE = Pattern.compile("\\.([1-9][0-9]{0,2})$");
/** {@code number-number}; groups 1 and 2 are the two numbers. */
private static final Pattern DASH_PAIR = Pattern.compile("([1-9][0-9]{0,2})\\-([1-9][0-9]{0,2})");
/** A dash followed by {@code number,}; group 1 is the number. */
private static final Pattern CHAPTER_AFTER_DASH = Pattern.compile("\\-([1-9][0-9]{0,2})\\,");
/** {@code ,number-}. */
private static final Pattern VERSE_BEFORE_DASH = Pattern.compile("\\,[1-9][0-9]{0,2}\\-");
/** {@code number,number-}; group 1 is the first number. */
private static final Pattern CHAPTER_VERSE_BEFORE_DASH = Pattern
        .compile("([1-9][0-9]{0,2})\\,[1-9][0-9]{0,2}\\-");
/** A plain run of ASCII digits short enough to always fit in an {@code int}. */
private static final Pattern PLAIN_NUMBER = Pattern.compile("[0-9]{1,9}");

/**
 * Validates a querystring of scripture references before it is sent.
 *
 * <p>The querystring is split on {@code ;} into individual queries. English
 * notation ({@code chapter:verse}) is first rewritten to European notation
 * ({@code chapter,verse}); mixing both notations is rejected outright. Each
 * query is then checked against the numbered syntax rules in the body and
 * its chapter and verse numbers are checked against {@code indexes} for
 * the selected versions.
 *
 * <p>{@code errorMessages} is cleared on entry and receives one message per
 * problem found. Malformed input is reported through the return value and
 * the messages rather than by an exception from number parsing.
 *
 * @param myQuery          the raw querystring, e.g. several queries joined by {@code ;}
 * @param selectedVersions the versions the chapter/verse bounds are checked against
 * @return {@code true} when no rule was violated, {@code false} otherwise
 * @throws java.lang.ClassNotFoundException       declared because {@code Indexes.getInstance()} is called
 *                                                here (presumably thrown by it; confirm)
 * @throws java.io.UnsupportedEncodingException   declared for the same reason (confirm)
 */
public boolean integrityCheck(String myQuery, List<String> selectedVersions)
        throws ClassNotFoundException, UnsupportedEncodingException {
    if (indexes == null) {
        indexes = Indexes.getInstance();
    }

    //final result is true until proved false
    //set finFlag to false for non-breaking errors, or simply return false for breaking errors
    boolean finFlag = true;

    errorMessages.clear();

    //if english notation is found, translate to european notation
    if (myQuery.contains(":") && myQuery.contains(".")) {
        errorMessages.add(__(
                "Mixed notations have been detected. Please use either english notation or european notation."));
        return false;
    } else if (myQuery.contains(":")) {
        myQuery = myQuery.replace(",", ".");
        myQuery = myQuery.replace(":", ",");
    }

    //split on semicolons, ignoring empty pieces
    List<String> queries = new ArrayList<>();
    for (String piece : myQuery.split(";")) {
        if (!piece.isEmpty()) {
            queries.add(piece);
        }
    }
    if (queries.isEmpty()) {
        errorMessages.add(__("You cannot send an empty query."));
        return false;
    }

    boolean first = true;
    String currBook = "";
    //index of currBook; a query without its own book indicator is checked against this book
    int currBookIdx = -1;

    for (String querie : queries) {
        querie = toProperCase(querie);
        //used to avoid piling a generic "malformed" message on top of a more specific one
        final int errorsBeforeQuery = errorMessages.size();
        final boolean startsWithBook = QUERY_WITH_BOOK.matcher(querie).matches();

        //RULE 1: at least the first query must have a book indicator
        if (first) {
            if (!startsWithBook) {
                errorMessages.add(MessageFormat.format(__(
                        "The first query <{0}> in the querystring <{1}> must start with a valid book indicator!"),
                        querie, myQuery));
                finFlag = false;
            }
            first = false;
        }

        //RULE 2: for every query that starts with a book indicator,
        //        the book indicator must be followed by valid chapter indicator;
        //        else query must start with valid chapter indicator
        int myidx = currBookIdx;
        String tempBook = "";
        if (startsWithBook) {
            //while we're at it, let's capture the book value from the query
            Matcher bookMatcher = BOOK_INDICATOR.matcher(querie);
            if (bookMatcher.find()) {
                tempBook = bookMatcher.group();
                int bBooksContains = isValidBook(tempBook);
                myidx = bBooksContains + 1;
                if (bBooksContains == -1) {
                    errorMessages.add(MessageFormat.format(__(
                            "The book indicator <{0}> in the query <{1}> is not valid. Please check the documentation for a list of valid book indicators."),
                            tempBook, querie));
                    finFlag = false;
                } else {
                    currBook = tempBook;
                    currBookIdx = myidx;
                }
            }

            if (!QUERY_WITH_BOOK_AND_CHAPTER.matcher(querie).matches()) {
                errorMessages.add(__("You must have a valid chapter following the book indicator!"));
                finFlag = false;
            }
            //strip only the leading book indicator; the rest of the query is left untouched
            querie = querie.substring(tempBook.length());
        } else {
            if (!QUERY_WITH_CHAPTER.matcher(querie).matches()) {
                errorMessages.add(__(
                        "A query that doesn't start with a book indicator must however start with a valid chapter indicator!"));
                finFlag = false;
            }
        }

        final int commaCount = StringUtils.countMatches(querie, ",");

        //RULE 3: Queries with a dot operator must first have a comma operator; and cannot have more commas than dots
        if (querie.contains(".")) {
            if (!querie.contains(",") || !NUMBER_BEFORE_DOT.matcher(querie).find()) {
                errorMessages.add(__(
                        "You cannot use a dot without first using a comma or a dash. A dot is a liason between verses, which are separated from the chapter by a comma."));
                finFlag = false;
            }
            Matcher dotPairs = DOT_PAIR.matcher(querie);
            int count = 0;
            while (dotPairs.find()) {
                //RULE 4: verse numbers around dot operators must be sequential
                if (Integer.parseInt(dotPairs.group(2)) >= Integer.parseInt(dotPairs.group(3))) {
                    errorMessages.add(MessageFormat.format(__(
                            "Verses concatenated by a dot must be consecutive, instead <{0}> is greater than or equal to <{1}> in the expression <{2}> in the query <{3}>"),
                            dotPairs.group(2), dotPairs.group(3), dotPairs.group(1), querie));
                    finFlag = false;
                }
                count++;
            }
            //RULE 5: Dot operators must be preceded and followed by a number from one to three digits, of which the first digit cannot be a 0
            if (count == 0 || count != StringUtils.countMatches(querie, ".")) {
                errorMessages.add(__(
                        "A dot must be preceded and followed by 1 to 3 digits of which the first digit cannot be zero.")
                        + " <" + querie + ">");
                finFlag = false;
            }
        }

        //RULE 6: Comma operators must be preceded and followed by a number from one to three digits, of which the first digit cannot be 0
        if (querie.contains(",")) {
            Matcher commaPairs = COMMA_PAIR.matcher(querie);
            int count = 0;
            List<Integer> chapters = new ArrayList<>();
            while (commaPairs.find()) {
                chapters.add(Integer.parseInt(commaPairs.group(1)));
                count++;
            }
            if (count == 0 || count != commaCount) {
                errorMessages.add(__(
                        "A comma must be preceded and followed by 1 to 3 digits of which the first digit cannot be zero.")
                        + " <" + querie + ">" + "(count=" + Integer.toString(count) + ",comma count="
                        + commaCount + "); chapters=" + chapters.toString());
                finFlag = false;
            } else {
                // let's check the validity of the chapter numbers against the version indexes
                for (int chapter : chapters) {
                    if (!chapterWithinBounds(chapter, Integer.toString(chapter), myidx, currBook,
                            selectedVersions)) {
                        finFlag = false;
                    }
                }
            }
        }

        final String malformedCommaMessage = __(
                "A comma must be preceded and followed by 1 to 3 digits of which the first digit cannot be zero.")
                + " <" + querie + ">";

        if (commaCount > 1) {
            if (!querie.contains("-")) {
                errorMessages.add(__("You cannot have more than one comma and not have a dash!"));
                finFlag = false;
            }
            String[] parts = StringUtils.split(querie, "-");
            if (parts.length != 2) {
                errorMessages
                        .add(__("You seem to have a malformed querystring, there should be only one dash."));
                finFlag = false;
            }
            for (String p : parts) {
                //each side of the dash is expected to be "chapter,verse" (verse may be a dot-joined list)
                String[] tt = StringUtils.split(p, ",");
                int chapter = tt.length == 2 ? parsePlainNumber(tt[0]) : -1;
                if (chapter == -1) {
                    addErrorUnlessAlreadyReported(errorsBeforeQuery, malformedCommaMessage);
                    finFlag = false;
                    continue;
                }
                if (!chapterWithinBounds(chapter, Integer.toString(chapter), myidx, currBook,
                        selectedVersions)) {
                    finFlag = false;
                    continue;
                }
                for (String verseToken : StringUtils.split(tt[1], ".")) {
                    int verse = parsePlainNumber(verseToken);
                    if (verse == -1) {
                        addErrorUnlessAlreadyReported(errorsBeforeQuery, malformedCommaMessage);
                        finFlag = false;
                    } else if (!verseWithinBounds(verse, Integer.toString(verse), chapter,
                            Integer.toString(chapter), myidx, currBook, selectedVersions)) {
                        finFlag = false;
                    }
                }
            }
        } else if (commaCount == 1) {
            String[] parts = StringUtils.split(querie, ",");
            int chapter = parts.length == 2 ? parsePlainNumber(parts[0]) : -1;
            if (chapter == -1) {
                addErrorUnlessAlreadyReported(errorsBeforeQuery, malformedCommaMessage);
                finFlag = false;
            } else if (!chapterWithinBounds(chapter, parts[0], myidx, currBook, selectedVersions)) {
                finFlag = false;
            } else {
                //highest verse named right after the comma (or at the end of the last range)
                int highverse = parts[1].contains("-")
                        ? lastMatchedNumber(VERSE_RANGE_END, querie)
                        : lastMatchedNumber(VERSE_AFTER_COMMA, querie);
                if (highverse != -1 && !verseWithinBounds(highverse, highverse, chapter, parts[0], myidx,
                        currBook, selectedVersions)) {
                    finFlag = false;
                }
                //a verse appended with a dot at the very end of the query
                int trailingVerse = lastMatchedNumber(TRAILING_DOT_VERSE, querie);
                if (trailingVerse != -1 && !verseWithinBounds(trailingVerse, trailingVerse, chapter,
                        parts[0], myidx, currBook, selectedVersions)) {
                    finFlag = false;
                }
            }
        } else { //if there is no comma, it's either a single chapter or an extension of chapters with a dash
            String[] parts = StringUtils.split(querie, "-");
            int highchapter = parts.length > 0 ? parsePlainNumber(parts[parts.length - 1]) : -1;
            if (highchapter == -1) {
                addErrorUnlessAlreadyReported(errorsBeforeQuery, tempBook.isEmpty()
                        ? __("A query that doesn't start with a book indicator must however start with a valid chapter indicator!")
                        : __("You must have a valid chapter following the book indicator!"));
                finFlag = false;
            } else if (!chapterWithinBounds(highchapter, Integer.toString(highchapter), myidx, currBook,
                    selectedVersions)) {
                finFlag = false;
            }
        }

        if (querie.contains("-")) {
            //RULE 7: If there are multiple dashes in a query, there cannot be more dashes than there are dots minus 1
            int dashcount = StringUtils.countMatches(querie, "-");
            int dotcount = StringUtils.countMatches(querie, ".");
            if (dashcount > 1 && dashcount - 1 > dotcount) {
                errorMessages.add(__(
                        "There are multiple dashes in the query, but there are not enough dots. There can only be one more dash than dots.")
                        + " <" + querie + ">");
                finFlag = false;
            }

            //RULE 8: Dash operators must be preceded and followed by a number from one to three digits, of which the first digit cannot be 0
            Matcher dashPairs = DASH_PAIR.matcher(querie);
            int count = 0;
            while (dashPairs.find()) {
                count++;
            }
            if (count == 0 || count != dashcount) {
                errorMessages.add(__(
                        "A dash must be preceded and followed by 1 to 3 digits of which the first digit cannot be zero.")
                        + " <" + querie + ">");
                finFlag = false;
            }

            //RULE 9: If a comma construct follows a dash, there must also be a comma construct preceding the dash
            Matcher chapterAfterDash = CHAPTER_AFTER_DASH.matcher(querie);
            if (chapterAfterDash.find()) {
                if (!VERSE_BEFORE_DASH.matcher(querie).find()) {
                    errorMessages.add(__(
                            "If there is a chapter-verse construct following a dash, there must also be a chapter-verse construct preceding the same dash.")
                            + " <" + querie + ">");
                    finFlag = false;
                } else {
                    //RULE 10: Chapters before and after dashes must be sequential
                    int chap1 = -1;
                    Matcher chapterBeforeDash = CHAPTER_VERSE_BEFORE_DASH.matcher(querie);
                    if (chapterBeforeDash.find()) {
                        chap1 = Integer.parseInt(chapterBeforeDash.group(1));
                    }
                    int chap2 = Integer.parseInt(chapterAfterDash.group(1));

                    if (chap1 >= chap2) {
                        errorMessages.add(MessageFormat.format(__(
                                "Chapters must be consecutive. Instead the first chapter indicator <{0}> is greater than or equal to the second chapter indicator <{1}> in the expression <{2}>"),
                                chap1, chap2, querie));
                        finFlag = false;
                    }
                }
            } else {
                //if there are no comma constructs immediately following the dash
                //RULE 11: Verses (or chapters if applicable) around each of the dash operator(s) must be sequential
                Matcher ranges = DASH_PAIR.matcher(querie);
                while (ranges.find()) {
                    int num1 = Integer.parseInt(ranges.group(1));
                    int num2 = Integer.parseInt(ranges.group(2));
                    if (num1 >= num2) {
                        errorMessages.add(MessageFormat.format(__(
                                "Verses (or chapters if applicable) around the dash operator must be consecutive. Instead <{0}> is greater than or equal to <{1}> in the expression <{2}>"),
                                num1, num2, querie));
                        finFlag = false;
                    }
                }
            }
        }
    }

    return finFlag;
}

/** Parses a plain run of digits; returns -1 when the token is not one. */
private static int parsePlainNumber(String token) {
    if (token == null || !PLAIN_NUMBER.matcher(token).matches()) {
        return -1;
    }
    return Integer.parseInt(token);
}

/** Returns group 1 of the last match of {@code pattern} in {@code text} as an int, or -1 without a match. */
private static int lastMatchedNumber(Pattern pattern, String text) {
    int last = -1;
    Matcher matcher = pattern.matcher(text);
    while (matcher.find()) {
        last = Integer.parseInt(matcher.group(1));
    }
    return last;
}

/** Adds {@code message} only when no error has been recorded since {@code errorCountBefore}. */
private void addErrorUnlessAlreadyReported(int errorCountBefore, String message) {
    if (errorMessages.size() == errorCountBefore) {
        errorMessages.add(message);
    }
}

/** Checks a chapter against the indexes; records the out-of-bounds message and returns false when invalid. */
private boolean chapterWithinBounds(int chapter, Object shownChapter, int bookIdx, String book,
        List<String> selectedVersions) {
    if (indexes.isValidChapter(chapter, bookIdx, selectedVersions)) {
        return true;
    }
    int[] chapterLimit = indexes.getChapterLimit(bookIdx, selectedVersions);
    errorMessages.add(MessageFormat.format(__(
            "A chapter in the query is out of bounds: there is no chapter <{0}> in the book <{1}> in the requested version <{2}>, the last possible chapter is <{3}>"),
            shownChapter, book, StringUtils.join(selectedVersions, ","),
            StringUtils.join(chapterLimit, ',')));
    return false;
}

/** Checks a verse against the indexes; records the out-of-bounds message and returns false when invalid. */
private boolean verseWithinBounds(int verse, Object shownVerse, int chapter, Object shownChapter,
        int bookIdx, String book, List<String> selectedVersions) {
    if (indexes.isValidVerse(verse, chapter, bookIdx, selectedVersions)) {
        return true;
    }
    int[] verseLimit = indexes.getVerseLimit(chapter, bookIdx, selectedVersions);
    errorMessages.add(MessageFormat.format(__(
            "A verse in the query is out of bounds: there is no verse <{0}> in the book <{1}> at chapter <{2}> in the requested version <{3}>, the last possible verse is <{4}>"),
            shownVerse, book, shownChapter, StringUtils.join(selectedVersions, ","),
            StringUtils.join(verseLimit, ',')));
    return false;
}

From source file:com.joliciel.talismane.tokeniser.filters.TokenRegexFilterImpl.java

Pattern getPattern() {
    if (pattern == null) {
        // we may need to replace WordLists by the list contents
        String myRegex = this.regex;

        if (LOG.isTraceEnabled()) {
            LOG.trace("Regex: " + myRegex);
        }//from  w ww  .j ava2s  .co  m

        if (this.autoWordBoundaries) {
            Boolean startsWithLetter = null;
            for (int i = 0; i < myRegex.length() && startsWithLetter == null; i++) {
                char c = myRegex.charAt(i);
                if (c == '\\') {
                    i++;
                    c = myRegex.charAt(i);
                    if (c == 'd' || c == 'w') {
                        startsWithLetter = true;
                    } else if (c == 's' || c == 'W' || c == 'b' || c == 'B') {
                        startsWithLetter = false;
                    } else if (c == 'p') {
                        i += 2; // skip the open curly brackets
                        int closeCurlyBrackets = myRegex.indexOf('}', i);
                        int openParentheses = myRegex.indexOf('(', i);
                        int endIndex = closeCurlyBrackets;
                        if (openParentheses > 0 && openParentheses < closeCurlyBrackets)
                            endIndex = openParentheses;
                        if (endIndex > 0) {
                            String specialClass = myRegex.substring(i, endIndex);
                            if (specialClass.equals("WordList")) {
                                startsWithLetter = true;
                            }
                        }
                    }
                    break;
                } else if (c == '[' || c == '(') {
                    // do nothing
                } else if (Character.isLetter(c) || Character.isDigit(c)) {
                    startsWithLetter = true;
                } else {
                    startsWithLetter = false;
                }
            }

            Boolean endsWithLetter = null;
            for (int i = myRegex.length() - 1; i >= 0 && endsWithLetter == null; i--) {
                char c = myRegex.charAt(i);
                char prevC = ' ';
                if (i >= 1)
                    prevC = myRegex.charAt(i - 1);
                if (prevC == '\\') {
                    if (c == 'd' || c == 'w') {
                        endsWithLetter = true;
                    } else if (c == 's' || c == 'W' || c == 'b' || c == 'B') {
                        endsWithLetter = false;
                    } else if (c == 'p') {
                        i += 2; // skip the open curly brackets
                        int closeCurlyBrackets = myRegex.indexOf('}', i);
                        int openParentheses = myRegex.indexOf('(', i);
                        int endIndex = closeCurlyBrackets;
                        if (openParentheses < closeCurlyBrackets)
                            endIndex = openParentheses;
                        if (endIndex > 0) {
                            String specialClass = myRegex.substring(i, endIndex);
                            if (specialClass.equals("WordList") || specialClass.equals("Alpha")
                                    || specialClass.equals("Lower") || specialClass.equals("Upper")
                                    || specialClass.equals("ASCII") || specialClass.equals("Digit")) {
                                startsWithLetter = true;
                            }
                        }
                    }
                    break;
                } else if (c == ']' || c == ')' || c == '+') {
                    // do nothing
                } else if (c == '}') {
                    int startIndex = myRegex.lastIndexOf('{') + 1;
                    int closeCurlyBrackets = myRegex.indexOf('}', startIndex);
                    int openParentheses = myRegex.indexOf('(', startIndex);
                    int endIndex = closeCurlyBrackets;
                    if (openParentheses > 0 && openParentheses < closeCurlyBrackets)
                        endIndex = openParentheses;
                    if (endIndex > 0) {
                        String specialClass = myRegex.substring(startIndex, endIndex);
                        if (specialClass.equals("WordList") || specialClass.equals("Alpha")
                                || specialClass.equals("Lower") || specialClass.equals("Upper")
                                || specialClass.equals("ASCII") || specialClass.equals("Digit")) {
                            endsWithLetter = true;
                        }
                    }
                    break;
                } else if (Character.isLetter(c) || Character.isDigit(c)) {
                    endsWithLetter = true;
                } else {
                    endsWithLetter = false;
                }
            }

            if (startsWithLetter != null && startsWithLetter) {
                myRegex = "\\b" + myRegex;
            }
            if (endsWithLetter != null && endsWithLetter) {
                myRegex = myRegex + "\\b";
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("After autoWordBoundaries: " + myRegex);
            }
        }

        if (!this.caseSensitive || !this.diacriticSensitive) {
            StringBuilder regexBuilder = new StringBuilder();
            for (int i = 0; i < myRegex.length(); i++) {
                char c = myRegex.charAt(i);
                if (c == '\\') {
                    // escape - skip next
                    regexBuilder.append(c);
                    i++;
                    c = myRegex.charAt(i);
                    regexBuilder.append(c);
                } else if (c == '[') {
                    // character group, don't change it
                    regexBuilder.append(c);
                    while (c != ']' && i < myRegex.length()) {
                        i++;
                        c = myRegex.charAt(i);
                        regexBuilder.append(c);
                    }
                } else if (c == '{') {
                    // command, don't change it
                    regexBuilder.append(c);
                    while (c != '}' && i < myRegex.length()) {
                        i++;
                        c = myRegex.charAt(i);
                        regexBuilder.append(c);
                    }
                } else if (Character.isLetter(c)) {
                    Set<String> chars = new TreeSet<String>();
                    chars.add("" + c);
                    char noAccent = diacriticPattern.matcher(Normalizer.normalize("" + c, Form.NFD))
                            .replaceAll("").charAt(0);

                    if (!this.caseSensitive) {
                        chars.add("" + Character.toUpperCase(c));
                        chars.add("" + Character.toLowerCase(c));
                        chars.add("" + Character.toUpperCase(noAccent));
                    }
                    if (!this.diacriticSensitive) {
                        chars.add("" + noAccent);
                        if (!this.caseSensitive) {
                            chars.add("" + Character.toLowerCase(noAccent));
                        }
                    }
                    if (chars.size() == 1) {
                        regexBuilder.append(c);
                    } else {
                        regexBuilder.append('[');
                        for (String oneChar : chars) {
                            regexBuilder.append(oneChar);
                        }
                        regexBuilder.append(']');
                    }
                } else {
                    regexBuilder.append(c);
                }
            }
            myRegex = regexBuilder.toString();
            if (LOG.isTraceEnabled()) {
                LOG.trace("After caseSensitive: " + myRegex);
            }
        }

        Matcher matcher = wordListPattern.matcher(myRegex);
        StringBuilder regexBuilder = new StringBuilder();

        int lastIndex = 0;
        while (matcher.find()) {
            String[] params = matcher.group(1).split(",");
            int start = matcher.start();
            int end = matcher.end();
            regexBuilder.append(myRegex.substring(lastIndex, start));

            String wordListName = params[0];
            boolean uppercaseOptional = false;
            boolean diacriticsOptional = false;
            boolean lowercaseOptional = false;
            boolean firstParam = true;
            for (String param : params) {
                if (firstParam) {
                    /* word list name */ } else if (param.equals("diacriticsOptional"))
                    diacriticsOptional = true;
                else if (param.equals("uppercaseOptional"))
                    uppercaseOptional = true;
                else if (param.equals("lowercaseOptional"))
                    lowercaseOptional = true;
                else
                    throw new TalismaneException(
                            "Unknown parameter in word list " + matcher.group(1) + ": " + param);
                firstParam = false;
            }

            ExternalWordList wordList = externalResourceFinder.getExternalWordList(wordListName);
            if (wordList == null)
                throw new TalismaneException("Unknown word list: " + wordListName);

            StringBuilder sb = new StringBuilder();

            boolean firstWord = true;
            for (String word : wordList.getWordList()) {
                if (!firstWord)
                    sb.append("|");
                word = Normalizer.normalize(word, Form.NFC);
                if (uppercaseOptional || diacriticsOptional) {
                    String wordNoDiacritics = Normalizer.normalize(word, Form.NFD)
                            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
                    String wordLowercase = word.toLowerCase(Locale.ENGLISH);
                    String wordLowercaseNoDiacritics = Normalizer.normalize(wordLowercase, Form.NFD)
                            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
                    String wordUppercase = wordNoDiacritics.toUpperCase(Locale.ENGLISH);

                    boolean needsGrouping = false;
                    if (uppercaseOptional && !word.equals(wordLowercase))
                        needsGrouping = true;
                    if (diacriticsOptional && !word.equals(wordNoDiacritics))
                        needsGrouping = true;
                    if (lowercaseOptional && !word.equals(wordUppercase))
                        needsGrouping = true;
                    if (needsGrouping) {
                        for (int i = 0; i < word.length(); i++) {
                            char c = word.charAt(i);

                            boolean grouped = false;
                            if (uppercaseOptional && c != wordLowercase.charAt(i))
                                grouped = true;
                            if (diacriticsOptional && c != wordNoDiacritics.charAt(i))
                                grouped = true;
                            if (lowercaseOptional && c != wordUppercase.charAt(i))
                                grouped = true;

                            if (!grouped)
                                sb.append(c);
                            else {
                                sb.append("[");
                                String group = "" + c;
                                if (uppercaseOptional && group.indexOf(wordLowercase.charAt(i)) < 0)
                                    group += (wordLowercase.charAt(i));
                                if (lowercaseOptional && group.indexOf(wordUppercase.charAt(i)) < 0)
                                    group += (wordUppercase.charAt(i));
                                if (diacriticsOptional && group.indexOf(wordNoDiacritics.charAt(i)) < 0)
                                    group += (wordNoDiacritics.charAt(i));
                                if (uppercaseOptional && diacriticsOptional
                                        && group.indexOf(wordLowercaseNoDiacritics.charAt(i)) < 0)
                                    group += (wordLowercaseNoDiacritics.charAt(i));

                                sb.append(group);
                                sb.append("]");
                            } // does this letter need grouping?
                        } // next letter
                    } else {
                        sb.append(word);
                    } // any options activated?
                } else {
                    sb.append(word);
                }
                firstWord = false;
            } // next word in list

            regexBuilder.append(sb.toString());
            lastIndex = end;
        } // next match
        regexBuilder.append(myRegex.substring(lastIndex));
        myRegex = regexBuilder.toString();
        this.pattern = Pattern.compile(myRegex, Pattern.UNICODE_CHARACTER_CLASS);
    }
    return pattern;
}

From source file:com.screenslicer.core.util.Util.java

/**
 * Applies the given regex-based URL transforms to every URL in {@code urls}.
 *
 * <p>For each URL the transforms are tried in array order. A transform is considered only
 * when it is non-null, has a non-empty {@code regex}, and its {@code transformForExportOnly}
 * flag equals {@code forExport}. When its pattern (compiled case-insensitively with Unicode
 * character classes) is found in the URL, the URL is rewritten with {@code replacement}:
 * <ul>
 *   <li>{@code replaceAll} unset: only the first match is replaced;</li>
 *   <li>{@code replaceAll} set: every match is replaced;</li>
 *   <li>{@code replaceAllRecursive} also set: replace-all is repeated until the result no
 *       longer changes.</li>
 * </ul>
 * After the first transform that matched, processing of that URL stops unless the transform
 * has {@code multipleTransforms} set.
 *
 * @param urls the URLs to transform; {@code null} entries are passed through untouched
 * @param urlTransforms the transforms to apply; {@code null} elements are skipped
 * @param forExport selects which transforms are eligible (see above)
 * @return a new list with the transformed URLs, or {@code urls} itself (possibly
 *         {@code null}) when {@code urlTransforms} is null/empty or {@code urls} is null
 */
public static List<String> transformUrlStrings(List<String> urls, UrlTransform[] urlTransforms,
        boolean forExport) {
    if (urlTransforms == null || urlTransforms.length == 0 || urls == null) {
        return urls;
    }
    // Patterns are compiled lazily (only for transforms that are actually eligible) and
    // reused across URLs instead of being recompiled for every URL.
    Pattern[] compiled = new Pattern[urlTransforms.length];
    List<String> newUrls = new ArrayList<String>(urls.size());
    for (String url : urls) {
        String newUrl = url;
        // A null URL is never transformed; it is added to the result as-is.
        for (int i = 0; newUrl != null && i < urlTransforms.length; i++) {
            UrlTransform transform = urlTransforms[i];
            // The null check must come before any field access on the transform.
            if (transform == null || CommonUtil.isEmpty(transform.regex)
                    || forExport != transform.transformForExportOnly) {
                continue;
            }
            if (compiled[i] == null) {
                compiled[i] = Pattern.compile(transform.regex,
                        Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
            }
            Pattern pattern = compiled[i];
            Matcher matcher = pattern.matcher(newUrl);
            if (!matcher.find()) {
                continue;
            }
            if (!transform.replaceAll) {
                newUrl = matcher.replaceFirst(transform.replacement);
            } else if (!transform.replaceAllRecursive) {
                newUrl = matcher.replaceAll(transform.replacement);
            } else {
                // Re-apply until a fixed point is reached. NOTE: there is no iteration cap,
                // so a replacement that keeps producing new, different matches never ends.
                String current = matcher.replaceAll(transform.replacement);
                String next = pattern.matcher(current).replaceAll(transform.replacement);
                while (!current.equals(next)) {
                    current = next;
                    next = pattern.matcher(current).replaceAll(transform.replacement);
                }
                newUrl = current;
            }
            if (!transform.multipleTransforms) {
                break;
            }
        }
        newUrls.add(newUrl);
    }
    return newUrls;
}

From source file:no.kantega.publishing.common.ContentIdHelperImpl.java

/**
 * Rebuilds {@code CONTENT_URL_PATTERN} from the context path of the supplied servlet context.
 *
 * <p>The regex text comes from {@code ContentPatterns.getPatternWithContextPath} and is
 * compiled with {@link Pattern#UNICODE_CHARACTER_CLASS}.
 *
 * @param servletContext the servlet context whose context path is used
 */
@Override
public void setServletContext(ServletContext servletContext) {
    final String contextPath = servletContext.getContextPath();
    final String contentUrlRegex = ContentPatterns.getPatternWithContextPath(contextPath);
    CONTENT_URL_PATTERN = Pattern.compile(contentUrlRegex, Pattern.UNICODE_CHARACTER_CLASS);
}

From source file:org.apache.nifi.processors.standard.EvaluateRegularExpression.java

/**
 * Builds the {@link Pattern} compile-flag bitmask from the boolean properties of the
 * given context.
 *
 * <p>Each property that evaluates to {@code true} contributes the {@link Pattern} flag
 * constant of the same name; properties that are {@code false} contribute nothing.
 *
 * @param context the context the flag properties are read from
 * @return the bitwise OR of all enabled {@link Pattern} flags, or {@code 0} if none is enabled
 */
int getCompileFlags(ProcessContext context) {
    int mask = 0;
    if (context.getProperty(UNIX_LINES).asBoolean()) {
        mask |= Pattern.UNIX_LINES;
    }
    if (context.getProperty(CASE_INSENSITIVE).asBoolean()) {
        mask |= Pattern.CASE_INSENSITIVE;
    }
    if (context.getProperty(COMMENTS).asBoolean()) {
        mask |= Pattern.COMMENTS;
    }
    if (context.getProperty(MULTILINE).asBoolean()) {
        mask |= Pattern.MULTILINE;
    }
    if (context.getProperty(LITERAL).asBoolean()) {
        mask |= Pattern.LITERAL;
    }
    if (context.getProperty(DOTALL).asBoolean()) {
        mask |= Pattern.DOTALL;
    }
    if (context.getProperty(UNICODE_CASE).asBoolean()) {
        mask |= Pattern.UNICODE_CASE;
    }
    if (context.getProperty(CANON_EQ).asBoolean()) {
        mask |= Pattern.CANON_EQ;
    }
    if (context.getProperty(UNICODE_CHARACTER_CLASS).asBoolean()) {
        mask |= Pattern.UNICODE_CHARACTER_CLASS;
    }
    return mask;
}

Example usage for java.util.regex Pattern UNICODE_CHARACTER_CLASS

Introduction

Prototype

Document

Usage