List of usage examples for java.util.regex Pattern UNICODE_CHARACTER_CLASS
int UNICODE_CHARACTER_CLASS
To view the source code for java.util.regex.Pattern.UNICODE_CHARACTER_CLASS, click the Source Link.
From source file:org.xbib.elasticsearch.index.analysis.langdetect.LangdetectService.java
@Override protected void doStart() throws ElasticsearchException { load(settings);//w w w . j av a 2s . co m this.priorMap = null; this.n_trial = settings.getAsInt("number_of_trials", 7); this.alpha = settings.getAsDouble("alpha", 0.5); this.alpha_width = settings.getAsDouble("alpha_width", 0.05); this.iteration_limit = settings.getAsInt("iteration_limit", 10000); this.prob_threshold = settings.getAsDouble("prob_threshold", 0.1); this.conv_threshold = settings.getAsDouble("conv_threshold", 0.99999); this.base_freq = settings.getAsInt("base_freq", 10000); this.filterPattern = settings.get("pattern") != null ? Pattern.compile(settings.get("pattern"), Pattern.UNICODE_CHARACTER_CLASS) : null; }
From source file:io.bibleget.HTTPCaller.java
/** * * @param myQuery//from w ww .ja v a2 s . c o m * @param selectedVersions * @return * @throws java.lang.ClassNotFoundException * @throws java.io.UnsupportedEncodingException */ public boolean integrityCheck(String myQuery, List<String> selectedVersions) throws ClassNotFoundException, UnsupportedEncodingException { String versionsStr = StringUtils.join(selectedVersions.toArray(), ','); //System.out.println("Starting integrity check on query "+myQuery+" for versions: "+versionsStr); if (indexes == null) { indexes = Indexes.getInstance(); } //build indexes based on versions //final result is true until proved false //set finFlag to false for non-breaking errors, or simply return false for breaking errors boolean finFlag = true; errorMessages.removeAll(errorMessages); List<String> queries = new ArrayList<>(); //if english notation is found, translate to european notation if (myQuery.contains(":") && myQuery.contains(".")) { errorMessages.add(__( "Mixed notations have been detected. Please use either english notation or european notation.")); return false; } else if (myQuery.contains(":")) { if (myQuery.contains(",")) { myQuery = myQuery.replace(",", "."); } myQuery = myQuery.replace(":", ","); } if (myQuery.isEmpty() == false) { if (myQuery.contains(";")) { //System.out.println("We have a semicolon"); queries.addAll(Arrays.asList(myQuery.split(";"))); for (Iterator<String> it = queries.iterator(); it.hasNext();) { if (it.next().isEmpty()) { it.remove(); // NOTE: Iterator's remove method, not ArrayList's, is used. 
} } } else { //System.out.println("There is no semicolon"); queries.add(myQuery); } } boolean first = true; String currBook = ""; if (queries.isEmpty()) { errorMessages.add(__("You cannot send an empty query.")); return false; } for (String querie : queries) { //System.out.println(querie); querie = toProperCase(querie); //System.out.println(querie); //RULE 1: at least the first query must have a book indicator if (first) { if (querie.matches("^[1-3]{0,1}((\\p{L}\\p{M}*)+)(.*)") == false) { errorMessages.add(MessageFormat.format(__( "The first query <{0}> in the querystring <{1}> must start with a valid book indicator!"), querie, myQuery)); finFlag = false; } first = false; } //RULE 2: for every query that starts with a book indicator, // the book indicator must be followed by valid chapter indicator; // else query must start with valid chapter indicator int bBooksContains; int myidx = -1; String tempBook = ""; if (querie.matches("^[1-3]{0,1}((\\p{L}\\p{M}*)+)(.*)") == true) { //while we're at it, let's capture the book value from the query Pattern pattern = Pattern.compile("^[1-3]{0,1}((\\p{L}\\p{M}*)+)", Pattern.UNICODE_CHARACTER_CLASS); Matcher matcher = pattern.matcher(querie); if (matcher.find()) { tempBook = matcher.group(); bBooksContains = isValidBook(tempBook); myidx = bBooksContains + 1; //if(bBooksContains == false && bBooksAbbrevsContains == false){ if (bBooksContains == -1) { errorMessages.add(MessageFormat.format(__( "The book indicator <{0}> in the query <{1}> is not valid. 
Please check the documentation for a list of valid book indicators."), tempBook, querie)); finFlag = false; } else { //if(bBooksContains) currBook = tempBook; //querie = querie.replace(tempBook,""); } } Pattern pattern1 = Pattern.compile("^[1-3]{0,1}((\\p{L}\\p{M}*)+)", Pattern.UNICODE_CHARACTER_CLASS); Pattern pattern2 = Pattern.compile("^[1-3]{0,1}((\\p{L}\\p{M}*)+)[1-9][0-9]{0,2}", Pattern.UNICODE_CHARACTER_CLASS); Matcher matcher1 = pattern1.matcher(querie); Matcher matcher2 = pattern2.matcher(querie); int count1 = 0; while (matcher1.find()) { count1++; } int count2 = 0; while (matcher2.find()) { count2++; } if (querie.matches("^[1-3]{0,1}((\\p{L}\\p{M}*)+)[1-9][0-9]{0,2}(.*)") == false || count1 != count2) { errorMessages.add(__("You must have a valid chapter following the book indicator!")); finFlag = false; } querie = querie.replace(tempBook, ""); } else { if (querie.matches("^[1-9][0-9]{0,2}(.*)") == false) { errorMessages.add(__( "A query that doesn't start with a book indicator must however start with a valid chapter indicator!")); finFlag = false; } } //RULE 3: Queries with a dot operator must first have a comma operator; and cannot have more commas than dots if (querie.contains(".")) { Pattern pattern11 = Pattern.compile("[,|\\-|\\.][1-9][0-9]{0,2}\\."); Matcher matcher11 = pattern11.matcher(querie); if (querie.contains(",") == false || matcher11.find() == false) { errorMessages.add(__( "You cannot use a dot without first using a comma or a dash. 
A dot is a liason between verses, which are separated from the chapter by a comma.")); finFlag = false; } Pattern pattern3 = Pattern.compile("(?<![0-9])(?=(([1-9][0-9]{0,2})\\.([1-9][0-9]{0,2})))"); Matcher matcher3 = pattern3.matcher(querie); int count = 0; while (matcher3.find()) { //RULE 4: verse numbers around dot operators must be sequential if (Integer.parseInt(matcher3.group(2)) >= Integer.parseInt(matcher3.group(3))) { errorMessages.add(MessageFormat.format(__( "Verses concatenated by a dot must be consecutive, instead <{0}> is greater than or equal to <{1}> in the expression <{2}> in the query <{3}>"), matcher3.group(2), matcher3.group(3), matcher3.group(1), querie)); finFlag = false; } count++; } //RULE 5: Dot operators must be preceded and followed by a number from one to three digits, of which the first digit cannot be a 0 if (count == 0 || count != StringUtils.countMatches(querie, ".")) { errorMessages.add(__( "A dot must be preceded and followed by 1 to 3 digits of which the first digit cannot be zero.") + " <" + querie + ">"); finFlag = false; } } //RULE 6: Comma operators must be preceded and followed by a number from one to three digits, of which the first digit cannot be 0 if (querie.contains(",")) { Pattern pattern4 = Pattern.compile("([1-9][0-9]{0,2})\\,[1-9][0-9]{0,2}"); Matcher matcher4 = pattern4.matcher(querie); int count = 0; List<Integer> chapters = new ArrayList<>(); while (matcher4.find()) { //System.out.println("group0="+matcher4.group(0)+", group1="+matcher4.group(1)); chapters.add(Integer.parseInt(matcher4.group(1))); count++; } if (count == 0 || count != StringUtils.countMatches(querie, ",")) { errorMessages.add(__( "A comma must be preceded and followed by 1 to 3 digits of which the first digit cannot be zero.") + " <" + querie + ">" + "(count=" + Integer.toString(count) + ",comma count=" + StringUtils.countMatches(querie, ",") + "); chapters=" + chapters.toString()); finFlag = false; } else { // let's check the validity of the 
chapter numbers against the version indexes //for each chapter captured in the querystring for (int chapter : chapters) { if (indexes.isValidChapter(chapter, myidx, selectedVersions) == false) { int[] chapterLimit = indexes.getChapterLimit(myidx, selectedVersions); errorMessages.add(MessageFormat.format(__( "A chapter in the query is out of bounds: there is no chapter <{0}> in the book <{1}> in the requested version <{2}>, the last possible chapter is <{3}>"), Integer.toString(chapter), currBook, StringUtils.join(selectedVersions, ","), StringUtils.join(chapterLimit, ','))); finFlag = false; } } } } if (StringUtils.countMatches(querie, ",") > 1) { if (!querie.contains("-")) { errorMessages.add(__("You cannot have more than one comma and not have a dash!")); finFlag = false; } String[] parts = StringUtils.split(querie, "-"); if (parts.length != 2) { errorMessages .add(__("You seem to have a malformed querystring, there should be only one dash.")); finFlag = false; } for (String p : parts) { Integer[] pp = new Integer[2]; String[] tt = StringUtils.split(p, ","); int x = 0; for (String t : tt) { pp[x++] = Integer.parseInt(t); } if (indexes.isValidChapter(pp[0], myidx, selectedVersions) == false) { int[] chapterLimit; chapterLimit = indexes.getChapterLimit(myidx, selectedVersions); // System.out.print("chapterLimit = "); // System.out.println(Arrays.toString(chapterLimit)); errorMessages.add(MessageFormat.format(__( "A chapter in the query is out of bounds: there is no chapter <{0}> in the book <{1}> in the requested version <{2}>, the last possible chapter is <{3}>"), Integer.toString(pp[0]), currBook, StringUtils.join(selectedVersions, ","), StringUtils.join(chapterLimit, ','))); finFlag = false; } else { if (indexes.isValidVerse(pp[1], pp[0], myidx, selectedVersions) == false) { int[] verseLimit = indexes.getVerseLimit(pp[0], myidx, selectedVersions); // System.out.print("verseLimit = "); // System.out.println(Arrays.toString(verseLimit)); 
errorMessages.add(MessageFormat.format(__( "A verse in the query is out of bounds: there is no verse <{0}> in the book <{1}> at chapter <{2}> in the requested version <{3}>, the last possible verse is <{4}>"), Integer.toString(pp[1]), currBook, Integer.toString(pp[0]), StringUtils.join(selectedVersions, ","), StringUtils.join(verseLimit, ','))); finFlag = false; } } } } else if (StringUtils.countMatches(querie, ",") == 1) { String[] parts = StringUtils.split(querie, ","); //System.out.println(Arrays.toString(parts)); if (indexes.isValidChapter(Integer.parseInt(parts[0]), myidx, selectedVersions) == false) { int[] chapterLimit = indexes.getChapterLimit(myidx, selectedVersions); errorMessages.add(MessageFormat.format(__( "A chapter in the query is out of bounds: there is no chapter <{0}> in the book <{1}> in the requested version <{2}>, the last possible chapter is <{3}>"), parts[0], currBook, StringUtils.join(selectedVersions, ","), StringUtils.join(chapterLimit, ','))); finFlag = false; } else { if (parts[1].contains("-")) { Deque<Integer> highverses = new ArrayDeque<>(); Pattern pattern11 = Pattern.compile("[,\\.][1-9][0-9]{0,2}\\-([1-9][0-9]{0,2})"); Matcher matcher11 = pattern11.matcher(querie); while (matcher11.find()) { highverses.push(Integer.parseInt(matcher11.group(1))); } int highverse = highverses.pop(); if (indexes.isValidVerse(highverse, Integer.parseInt(parts[0]), myidx, selectedVersions) == false) { int[] verseLimit = indexes.getVerseLimit(Integer.parseInt(parts[0]), myidx, selectedVersions); errorMessages.add(MessageFormat.format(__( "A verse in the query is out of bounds: there is no verse <{0}> in the book <{1}> at chapter <{2}> in the requested version <{3}>, the last possible verse is <{4}>"), highverse, currBook, parts[0], StringUtils.join(selectedVersions, ","), StringUtils.join(verseLimit, ','))); finFlag = false; } } else { Pattern pattern12 = Pattern.compile(",([1-9][0-9]{0,2})"); Matcher matcher12 = pattern12.matcher(querie); int highverse 
= -1; while (matcher12.find()) { highverse = Integer.parseInt(matcher12.group(1)); //System.out.println("[line 376]:highverse="+Integer.toString(highverse)); } if (highverse != -1) { //System.out.println("Checking verse validity for book "+myidx+" chapter "+parts[0]+"..."); if (indexes.isValidVerse(highverse, Integer.parseInt(parts[0]), myidx, selectedVersions) == false) { int[] verseLimit = indexes.getVerseLimit(Integer.parseInt(parts[0]), myidx, selectedVersions); errorMessages.add(MessageFormat.format(__( "A verse in the query is out of bounds: there is no verse <{0}> in the book <{1}> at chapter <{2}> in the requested version <{3}>, the last possible verse is <{4}>"), highverse, currBook, parts[0], StringUtils.join(selectedVersions, ","), StringUtils.join(verseLimit, ','))); finFlag = false; } } } Pattern pattern13 = Pattern.compile("\\.([1-9][0-9]{0,2})$"); Matcher matcher13 = pattern13.matcher(querie); int highverse = -1; while (matcher13.find()) { highverse = Integer.parseInt(matcher13.group(1)); } if (highverse != -1) { if (indexes.isValidVerse(highverse, Integer.parseInt(parts[0]), myidx, selectedVersions) == false) { int[] verseLimit = indexes.getVerseLimit(Integer.parseInt(parts[0]), myidx, selectedVersions); errorMessages.add(MessageFormat.format(__( "A verse in the query is out of bounds: there is no verse <{0}> in the book <{1}> at chapter <{2}> in the requested version <{3}>, the last possible verse is <{4}>"), highverse, currBook, parts[0], StringUtils.join(selectedVersions, ","), StringUtils.join(verseLimit, ','))); finFlag = false; } } } } else { //if there is no comma, it's either a single chapter or an extension of chapters with a dash //System.out.println("no comma found"); String[] parts = StringUtils.split(querie, "-"); //System.out.println(Arrays.toString(parts)); int highchapter = Integer.parseInt(parts[parts.length - 1]); if (indexes.isValidChapter(highchapter, myidx, selectedVersions) == false) { int[] chapterLimit = 
indexes.getChapterLimit(myidx, selectedVersions); errorMessages.add(MessageFormat.format(__( "A chapter in the query is out of bounds: there is no chapter <{0}> in the book <{1}> in the requested version <{2}>, the last possible chapter is <{3}>"), Integer.toString(highchapter), currBook, StringUtils.join(selectedVersions, ","), StringUtils.join(chapterLimit, ','))); finFlag = false; } } if (querie.contains("-")) { //RULE 7: If there are multiple dashes in a query, there cannot be more dashes than there are dots minus 1 int dashcount = StringUtils.countMatches(querie, "-"); int dotcount = StringUtils.countMatches(querie, "."); if (dashcount > 1) { if (dashcount - 1 > dotcount) { errorMessages.add(__( "There are multiple dashes in the query, but there are not enough dots. There can only be one more dash than dots.") + " <" + querie + ">"); finFlag = false; } } //RULE 8: Dash operators must be preceded and followed by a number from one to three digits, of which the first digit cannot be 0 Pattern pattern5 = Pattern.compile("([1-9][0-9]{0,2}\\-[1-9][0-9]{0,2})"); Matcher matcher5 = pattern5.matcher(querie); int count = 0; while (matcher5.find()) { count++; } if (count == 0 || count != StringUtils.countMatches(querie, "-")) { errorMessages.add(__( "A dash must be preceded and followed by 1 to 3 digits of which the first digit cannot be zero.") + " <" + querie + ">"); finFlag = false; } //RULE 9: If a comma construct follows a dash, there must also be a comma construct preceding the dash Pattern pattern6 = Pattern.compile("\\-([1-9][0-9]{0,2})\\,"); Matcher matcher6 = pattern6.matcher(querie); if (matcher6.find()) { Pattern pattern7 = Pattern.compile("\\,[1-9][0-9]{0,2}\\-"); Matcher matcher7 = pattern7.matcher(querie); if (matcher7.find() == false) { errorMessages.add(__( "If there is a chapter-verse construct following a dash, there must also be a chapter-verse construct preceding the same dash.") + " <" + querie + ">"); finFlag = false; } else { //RULE 10: Chapters 
before and after dashes must be sequential int chap1 = -1; int chap2 = -1; Pattern pattern8 = Pattern.compile("([1-9][0-9]{0,2})\\,[1-9][0-9]{0,2}\\-"); Matcher matcher8 = pattern8.matcher(querie); if (matcher8.find()) { chap1 = Integer.parseInt(matcher8.group(1)); } Pattern pattern9 = Pattern.compile("\\-([1-9][0-9]{0,2})\\,"); Matcher matcher9 = pattern9.matcher(querie); if (matcher9.find()) { chap2 = Integer.parseInt(matcher9.group(1)); } if (chap1 >= chap2) { errorMessages.add(MessageFormat.format(__( "Chapters must be consecutive. Instead the first chapter indicator <{0}> is greater than or equal to the second chapter indicator <{1}> in the expression <{2}>"), chap1, chap2, querie)); finFlag = false; } } } else { //if there are no comma constructs immediately following the dash //RULE 11: Verses (or chapters if applicable) around each of the dash operator(s) must be sequential Pattern pattern10 = Pattern.compile("([1-9][0-9]{0,2})\\-([1-9][0-9]{0,2})"); Matcher matcher10 = pattern10.matcher(querie); while (matcher10.find()) { int num1 = Integer.parseInt(matcher10.group(1)); int num2 = Integer.parseInt(matcher10.group(2)); if (num1 >= num2) { errorMessages.add(MessageFormat.format(__( "Verses (or chapters if applicable) around the dash operator must be consecutive. Instead <{0}> is greater than or equal to <{1}> in the expression <{2}>"), num1, num2, querie)); finFlag = false; } } } } } return finFlag; }
From source file:com.joliciel.talismane.tokeniser.filters.TokenRegexFilterImpl.java
/**
 * Returns the compiled pattern for this filter, building and caching it in
 * {@code pattern} on the first call.
 *
 * <p>The regex is rewritten in up to three steps before it is compiled with
 * {@link Pattern#UNICODE_CHARACTER_CLASS}:
 * <ol>
 * <li>when {@code autoWordBoundaries} is set, {@code \b} is added in front of and/or
 * behind the regex if it appears to start/end with a letter or digit;</li>
 * <li>when the filter is not case sensitive or not diacritic sensitive, each letter
 * outside escapes, {@code [...]} and {@code {...}} is replaced by a character class of
 * its accepted variants;</li>
 * <li>every match of {@code wordListPattern} is replaced by an alternation of the
 * words of the named external word list.</li>
 * </ol>
 *
 * @return the compiled pattern
 * @throws TalismaneException if a word list reference names an unknown parameter or an
 *         unknown word list
 */
Pattern getPattern() {
    if (pattern == null) {
        // we may need to replace WordLists by the list contents
        String myRegex = this.regex;
        if (LOG.isTraceEnabled()) {
            LOG.trace("Regex: " + myRegex);
        }

        if (this.autoWordBoundaries) {
            // Scan forwards for the first character that decides whether the regex
            // starts with a letter; null means "undecided".
            Boolean startsWithLetter = null;
            for (int i = 0; i < myRegex.length() && startsWithLetter == null; i++) {
                char c = myRegex.charAt(i);
                if (c == '\\') {
                    i++;
                    if (i >= myRegex.length()) {
                        // dangling backslash: leave it for Pattern.compile to report
                        break;
                    }
                    c = myRegex.charAt(i);
                    if (c == 'd' || c == 'w') {
                        startsWithLetter = true;
                    } else if (c == 's' || c == 'W' || c == 'b' || c == 'B') {
                        startsWithLetter = false;
                    } else if (c == 'p') {
                        i += 2; // skip the open curly brackets
                        int closeCurlyBrackets = myRegex.indexOf('}', i);
                        int openParentheses = myRegex.indexOf('(', i);
                        int endIndex = closeCurlyBrackets;
                        if (openParentheses > 0 && openParentheses < closeCurlyBrackets) {
                            endIndex = openParentheses;
                        }
                        if (endIndex > 0) {
                            String specialClass = myRegex.substring(i, endIndex);
                            if (specialClass.equals("WordList")) {
                                startsWithLetter = true;
                            }
                        }
                    }
                    break;
                } else if (c == '[' || c == '(') {
                    // do nothing
                } else if (Character.isLetter(c) || Character.isDigit(c)) {
                    startsWithLetter = true;
                } else {
                    startsWithLetter = false;
                }
            }

            // Scan backwards for the last character that decides whether the regex
            // ends with a letter.
            Boolean endsWithLetter = null;
            for (int i = myRegex.length() - 1; i >= 0 && endsWithLetter == null; i--) {
                char c = myRegex.charAt(i);
                char prevC = ' ';
                if (i >= 1) {
                    prevC = myRegex.charAt(i - 1);
                }
                if (prevC == '\\') {
                    if (c == 'd' || c == 'w') {
                        endsWithLetter = true;
                    } else if (c == 's' || c == 'W' || c == 'b' || c == 'B') {
                        endsWithLetter = false;
                    } else if (c == 'p') {
                        i += 2; // skip the open curly brackets
                        int closeCurlyBrackets = myRegex.indexOf('}', i);
                        int openParentheses = myRegex.indexOf('(', i);
                        int endIndex = closeCurlyBrackets;
                        // Only let the parenthesis win when there is one, as in the
                        // other two class-name lookups of this method.
                        if (openParentheses > 0 && openParentheses < closeCurlyBrackets) {
                            endIndex = openParentheses;
                        }
                        if (endIndex > 0) {
                            String specialClass = myRegex.substring(i, endIndex);
                            if (specialClass.equals("WordList") || specialClass.equals("Alpha")
                                    || specialClass.equals("Lower") || specialClass.equals("Upper")
                                    || specialClass.equals("ASCII") || specialClass.equals("Digit")) {
                                // This scan decides the END of the regex.
                                endsWithLetter = true;
                            }
                        }
                    }
                    break;
                } else if (c == ']' || c == ')' || c == '+') {
                    // do nothing
                } else if (c == '}') {
                    int startIndex = myRegex.lastIndexOf('{') + 1;
                    int closeCurlyBrackets = myRegex.indexOf('}', startIndex);
                    int openParentheses = myRegex.indexOf('(', startIndex);
                    int endIndex = closeCurlyBrackets;
                    if (openParentheses > 0 && openParentheses < closeCurlyBrackets) {
                        endIndex = openParentheses;
                    }
                    if (endIndex > 0) {
                        String specialClass = myRegex.substring(startIndex, endIndex);
                        if (specialClass.equals("WordList") || specialClass.equals("Alpha")
                                || specialClass.equals("Lower") || specialClass.equals("Upper")
                                || specialClass.equals("ASCII") || specialClass.equals("Digit")) {
                            endsWithLetter = true;
                        }
                    }
                    break;
                } else if (Character.isLetter(c) || Character.isDigit(c)) {
                    endsWithLetter = true;
                } else {
                    endsWithLetter = false;
                }
            }

            if (startsWithLetter != null && startsWithLetter) {
                myRegex = "\\b" + myRegex;
            }
            if (endsWithLetter != null && endsWithLetter) {
                myRegex = myRegex + "\\b";
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("After autoWordBoundaries: " + myRegex);
            }
        }

        if (!this.caseSensitive || !this.diacriticSensitive) {
            StringBuilder regexBuilder = new StringBuilder();
            for (int i = 0; i < myRegex.length(); i++) {
                char c = myRegex.charAt(i);
                if (c == '\\') {
                    // escape - copy the escaped character unchanged, if there is one
                    regexBuilder.append(c);
                    i++;
                    if (i < myRegex.length()) {
                        regexBuilder.append(myRegex.charAt(i));
                    }
                } else if (c == '[') {
                    // character group, don't change it; stop at the end of the regex if
                    // the group is never closed (Pattern.compile reports that)
                    regexBuilder.append(c);
                    while (c != ']' && i + 1 < myRegex.length()) {
                        i++;
                        c = myRegex.charAt(i);
                        regexBuilder.append(c);
                    }
                } else if (c == '{') {
                    // command, don't change it; same end-of-regex guard as above
                    regexBuilder.append(c);
                    while (c != '}' && i + 1 < myRegex.length()) {
                        i++;
                        c = myRegex.charAt(i);
                        regexBuilder.append(c);
                    }
                } else if (Character.isLetter(c)) {
                    Set<String> chars = new TreeSet<String>();
                    chars.add("" + c);
                    char noAccent = diacriticPattern.matcher(Normalizer.normalize("" + c, Form.NFD))
                            .replaceAll("").charAt(0);
                    if (!this.caseSensitive) {
                        chars.add("" + Character.toUpperCase(c));
                        chars.add("" + Character.toLowerCase(c));
                        // NOTE(review): the unaccented capital is accepted even when the
                        // filter is diacritic sensitive - presumably deliberate, mirroring
                        // the word-list handling below; confirm.
                        chars.add("" + Character.toUpperCase(noAccent));
                    }
                    if (!this.diacriticSensitive) {
                        chars.add("" + noAccent);
                        if (!this.caseSensitive) {
                            chars.add("" + Character.toLowerCase(noAccent));
                        }
                    }
                    if (chars.size() == 1) {
                        regexBuilder.append(c);
                    } else {
                        regexBuilder.append('[');
                        for (String oneChar : chars) {
                            regexBuilder.append(oneChar);
                        }
                        regexBuilder.append(']');
                    }
                } else {
                    regexBuilder.append(c);
                }
            }
            myRegex = regexBuilder.toString();
            if (LOG.isTraceEnabled()) {
                LOG.trace("After caseSensitive: " + myRegex);
            }
        }

        // Replace every word list reference by an alternation of the list's words.
        Matcher matcher = wordListPattern.matcher(myRegex);
        StringBuilder regexBuilder = new StringBuilder();
        int lastIndex = 0;
        while (matcher.find()) {
            String[] params = matcher.group(1).split(",");
            int start = matcher.start();
            int end = matcher.end();
            regexBuilder.append(myRegex.substring(lastIndex, start));

            // params[0] is the word list name, the remaining entries are options
            String wordListName = params[0];
            boolean uppercaseOptional = false;
            boolean diacriticsOptional = false;
            boolean lowercaseOptional = false;
            for (int p = 1; p < params.length; p++) {
                String param = params[p];
                if (param.equals("diacriticsOptional")) {
                    diacriticsOptional = true;
                } else if (param.equals("uppercaseOptional")) {
                    uppercaseOptional = true;
                } else if (param.equals("lowercaseOptional")) {
                    lowercaseOptional = true;
                } else {
                    throw new TalismaneException(
                            "Unknown parameter in word list " + matcher.group(1) + ": " + param);
                }
            }

            ExternalWordList wordList = externalResourceFinder.getExternalWordList(wordListName);
            if (wordList == null) {
                throw new TalismaneException("Unknown word list: " + wordListName);
            }

            StringBuilder sb = new StringBuilder();
            boolean firstWord = true;
            for (String word : wordList.getWordList()) {
                if (!firstWord) {
                    sb.append("|");
                }
                word = Normalizer.normalize(word, Form.NFC);
                if (uppercaseOptional || diacriticsOptional) {
                    String wordNoDiacritics = Normalizer.normalize(word, Form.NFD)
                            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
                    String wordLowercase = word.toLowerCase(Locale.ENGLISH);
                    String wordLowercaseNoDiacritics = Normalizer.normalize(wordLowercase, Form.NFD)
                            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
                    String wordUppercase = wordNoDiacritics.toUpperCase(Locale.ENGLISH);

                    boolean needsGrouping = false;
                    if (uppercaseOptional && !word.equals(wordLowercase)) {
                        needsGrouping = true;
                    }
                    if (diacriticsOptional && !word.equals(wordNoDiacritics)) {
                        needsGrouping = true;
                    }
                    if (lowercaseOptional && !word.equals(wordUppercase)) {
                        needsGrouping = true;
                    }

                    if (needsGrouping) {
                        for (int i = 0; i < word.length(); i++) {
                            char c = word.charAt(i);
                            boolean grouped = false;
                            if (uppercaseOptional && c != wordLowercase.charAt(i)) {
                                grouped = true;
                            }
                            if (diacriticsOptional && c != wordNoDiacritics.charAt(i)) {
                                grouped = true;
                            }
                            if (lowercaseOptional && c != wordUppercase.charAt(i)) {
                                grouped = true;
                            }
                            if (!grouped) {
                                sb.append(c);
                            } else {
                                sb.append("[");
                                String group = "" + c;
                                if (uppercaseOptional && group.indexOf(wordLowercase.charAt(i)) < 0) {
                                    group += (wordLowercase.charAt(i));
                                }
                                if (lowercaseOptional && group.indexOf(wordUppercase.charAt(i)) < 0) {
                                    group += (wordUppercase.charAt(i));
                                }
                                if (diacriticsOptional && group.indexOf(wordNoDiacritics.charAt(i)) < 0) {
                                    group += (wordNoDiacritics.charAt(i));
                                }
                                if (uppercaseOptional && diacriticsOptional
                                        && group.indexOf(wordLowercaseNoDiacritics.charAt(i)) < 0) {
                                    group += (wordLowercaseNoDiacritics.charAt(i));
                                }
                                sb.append(group);
                                sb.append("]");
                            } // does this letter need grouping?
                        } // next letter
                    } else {
                        sb.append(word);
                    } // any options activated?
                } else {
                    sb.append(word);
                }
                firstWord = false;
            } // next word in list
            regexBuilder.append(sb.toString());
            lastIndex = end;
        } // next match
        regexBuilder.append(myRegex.substring(lastIndex));
        myRegex = regexBuilder.toString();

        this.pattern = Pattern.compile(myRegex, Pattern.UNICODE_CHARACTER_CLASS);
    }
    return pattern;
}
From source file:com.screenslicer.core.util.Util.java
public static List<String> transformUrlStrings(List<String> urls, UrlTransform[] urlTransforms, boolean forExport) { List<String> newUrls = new ArrayList<String>(); if (urlTransforms != null && urlTransforms.length != 0 && urls != null) { for (String url : urls) { String newUrl = url;//from w w w. java 2 s. c o m for (int i = 0; urlTransforms != null && i < urlTransforms.length; i++) { if (!CommonUtil.isEmpty(urlTransforms[i].regex) && newUrl != null && urlTransforms[i] != null && ((forExport && urlTransforms[i].transformForExportOnly) || (!forExport && !urlTransforms[i].transformForExportOnly))) { Pattern pattern = Pattern.compile(urlTransforms[i].regex, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS); Matcher matcher = pattern.matcher(newUrl); if (matcher.find()) { if (urlTransforms[i].replaceAll) { if (urlTransforms[i].replaceAllRecursive) { String transformed = matcher.replaceAll(urlTransforms[i].replacement); String transformedRec = pattern.matcher(transformed) .replaceAll(urlTransforms[i].replacement); while (!transformed.equals(transformedRec)) { transformed = transformedRec; transformedRec = pattern.matcher(transformedRec) .replaceAll(urlTransforms[i].replacement); } newUrl = transformed; } else { newUrl = matcher.replaceAll(urlTransforms[i].replacement); } } else { newUrl = matcher.replaceFirst(urlTransforms[i].replacement); } if (!urlTransforms[i].multipleTransforms) { break; } } } } newUrls.add(newUrl); } } else { return urls; } return newUrls; }
From source file:no.kantega.publishing.common.ContentIdHelperImpl.java
/**
 * Rebuilds {@code CONTENT_URL_PATTERN} from the context path of the given servlet
 * context, compiling it with {@link Pattern#UNICODE_CHARACTER_CLASS}.
 *
 * @param servletContext the servlet context whose context path is used
 */
@Override
public void setServletContext(ServletContext servletContext) {
    final String contextPath = servletContext.getContextPath();
    final String contentUrlRegex = ContentPatterns.getPatternWithContextPath(contextPath);
    CONTENT_URL_PATTERN = Pattern.compile(contentUrlRegex, Pattern.UNICODE_CHARACTER_CLASS);
}
From source file:org.apache.nifi.processors.standard.EvaluateRegularExpression.java
/**
 * Combines the boolean regex properties of the given context into the bit mask
 * expected by {@link Pattern#compile(String, int)}.
 *
 * @param context the process context the properties are read from
 * @return the OR of the {@link Pattern} flag of every property that is enabled
 */
int getCompileFlags(ProcessContext context) {
    int flags = 0;
    if (context.getProperty(UNIX_LINES).asBoolean()) {
        flags |= Pattern.UNIX_LINES;
    }
    if (context.getProperty(CASE_INSENSITIVE).asBoolean()) {
        flags |= Pattern.CASE_INSENSITIVE;
    }
    if (context.getProperty(COMMENTS).asBoolean()) {
        flags |= Pattern.COMMENTS;
    }
    if (context.getProperty(MULTILINE).asBoolean()) {
        flags |= Pattern.MULTILINE;
    }
    if (context.getProperty(LITERAL).asBoolean()) {
        flags |= Pattern.LITERAL;
    }
    if (context.getProperty(DOTALL).asBoolean()) {
        flags |= Pattern.DOTALL;
    }
    if (context.getProperty(UNICODE_CASE).asBoolean()) {
        flags |= Pattern.UNICODE_CASE;
    }
    if (context.getProperty(CANON_EQ).asBoolean()) {
        flags |= Pattern.CANON_EQ;
    }
    if (context.getProperty(UNICODE_CHARACTER_CLASS).asBoolean()) {
        flags |= Pattern.UNICODE_CHARACTER_CLASS;
    }
    return flags;
}