List of usage examples for org.apache.commons.lang3 RegExUtils removePattern
public static String removePattern(final String text, final String regex)
Removes each substring of the source String that matches the given regular expression using the DOTALL option.
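This call is a null safe equivalent to: text.replaceAll("(?s)" + regex, StringUtils.EMPTY), or equivalently Pattern.compile(regex, Pattern.DOTALL).matcher(text).replaceAll(StringUtils.EMPTY). A null reference passed to this method is a no-op.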
RegExUtils.removePattern(null, *) = null
RegExUtils.removePattern("any", (String) null) = "any"
RegExUtils.removePattern("A<__>\n<__>B", "<.*>") = "AB"
RegExUtils.removePattern("ABCabc123", "[a-z]") = "ABC123"
From source file: org.languagetool.rules.de.GermanSpellerRule.java
/**
 * Checks whether {@code word} consists of one of a fixed set of German
 * intensifying prefixes followed by a remainder that {@code isMisspelled}
 * does not reject.
 *
 * <p>The prefix is stripped with an anchored regular expression, so only a
 * prefix at the very beginning of the word is removed.
 *
 * <p>The prefix "frueh" (with u-umlaut) is written as a Unicode escape. The
 * literal had lost its umlaut, which made that prefix unmatchable; the escape
 * keeps the source ASCII-only so the character cannot be dropped again.
 *
 * <p>NOTE(review): the regular expression also lists "grund" and "voll",
 * which are absent from the {@code startsWithAny} guard, so those two
 * alternatives can never be the ones that match here. Confirm whether that
 * is intended before aligning the two lists.
 *
 * @param word the word to examine
 * @return {@code true} if the word starts with one of the guarded prefixes
 *         and the rest of the word is not reported as misspelled;
 *         {@code false} otherwise
 */
private boolean ignoreElative(String word) {
  if (StringUtils.startsWithAny(word, "bitter", "dunkel", "erz", "extra", "fr\u00fch", "gemein", "hyper",
      "lau", "mega", "minder", "stock", "super", "tod", "ultra", "ur")) {
    String lastPart = RegExUtils.removePattern(word,
        "^(bitter|dunkel|erz|extra|fr\u00fch|gemein|grund|hyper|lau|mega|minder|stock|super|tod|ultra|ur|voll)");
    return !isMisspelled(lastPart);
  }
  return false;
}
From source file: org.languagetool.tagging.de.GermanTagger.java
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens, boolean ignoreCase) throws IOException { initializeIfRequired();//from w ww. ja v a2 s .c o m boolean firstWord = true; List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>(); int pos = 0; String prevWord = null; for (String word : sentenceTokens) { List<AnalyzedToken> readings = new ArrayList<>(); List<TaggedWord> taggerTokens = getWordTagger().tag(word); //Only first iteration. Consider ":" as a potential sentence start marker if ((firstWord || ":".equals(prevWord)) && taggerTokens.isEmpty() && ignoreCase) { // e.g. "Das" -> "das" at start of sentence taggerTokens = getWordTagger().tag(word.toLowerCase()); firstWord = word.matches("^\\W?$"); } else if (pos == 0 && ignoreCase) { // "Haben", "Sollen", "Knnen", "Gerade" etc. at start of sentence taggerTokens.addAll(getWordTagger().tag(word.toLowerCase())); } else if (pos > 1 && taggerTokens.isEmpty() && ignoreCase) { int idx = sentenceTokens.indexOf(word); // add lowercase token readings to words at start of direct speech if (idx > 2 && sentenceTokens.get(idx - 1).contentEquals("") && sentenceTokens.get(idx - 3).contentEquals(":")) { taggerTokens.addAll(getWordTagger().tag(word.toLowerCase())); } } if (taggerTokens.size() > 0) { //Word known, just add analyzed token to readings readings.addAll(getAnalyzedTokens(taggerTokens, word)); } else { // Word not known, try to decompose it and use the last part for POS tagging: if (!StringTools.isEmpty(word.trim())) { List<String> compoundParts = compoundTokenizer.tokenize(word); if (compoundParts.size() <= 1) {//Could not find simple compound parts // Recognize alternative imperative forms (e.g., "Geh bitte!" 
in addition to "Gehe bitte!") List<AnalyzedToken> imperativeFormList = getImperativeForm(word, sentenceTokens, pos); List<AnalyzedToken> substantivatedFormsList = getSubstantivatedForms(word, sentenceTokens, pos); if (imperativeFormList.size() > 0) { readings.addAll(imperativeFormList); } else if (substantivatedFormsList.size() > 0) { readings.addAll(substantivatedFormsList); } else { if (StringUtils.startsWithAny(word, "bitter", "dunkel", "erz", "extra", "frh", "gemein", "hyper", "lau", "mega", "minder", "stock", "super", "tod", "ultra", "ur")) { String lastPart = RegExUtils.removePattern(word, "^(bitter|dunkel|erz|extra|frh|gemein|grund|hyper|lau|mega|minder|stock|super|tod|ultra|ur|voll)"); if (lastPart.length() > 1) { String firstPart = StringUtils.removeEnd(word, lastPart); List<TaggedWord> taggedWords = getWordTagger().tag(lastPart); for (TaggedWord taggedWord : taggedWords) { readings.add(new AnalyzedToken(word, taggedWord.getPosTag(), firstPart + taggedWord.getLemma())); } } } //Separate dash-linked words //Only check single word tokens and skip words containing numbers because it's unpredictable if (word.split(" ").length == 1 && !Character.isDigit(word.charAt(0))) { String wordOrig = word; word = sanitizeWord(word); String wordStem = wordOrig.substring(0, wordOrig.length() - word.length()); //Tokenize, start word uppercase if it's a result of splitting List<String> compoundedWord = compoundTokenizer.tokenize(word); if (compoundedWord.size() > 1) { word = StringTools .uppercaseFirstChar(compoundedWord.get(compoundedWord.size() - 1)); } else { word = compoundedWord.get(compoundedWord.size() - 1); } List<TaggedWord> linkedTaggerTokens = addStem(getWordTagger().tag(word), wordStem); //Try to analyze the last part found //Some words that are linked with a dash ('-') will be written in uppercase, even adjectives if (wordOrig.contains("-") && linkedTaggerTokens.isEmpty() && matchesUppercaseAdjective(word)) { word = StringTools.lowercaseFirstChar(word); 
linkedTaggerTokens = getWordTagger().tag(word); } word = wordOrig; boolean wordStartsUppercase = StringTools.startsWithUppercase(word); if (linkedTaggerTokens.isEmpty()) { readings.add(getNoInfoToken(word)); } else { if (wordStartsUppercase) { //Choose between uppercase/lowercase Lemma readings.addAll(getAnalyzedTokens(linkedTaggerTokens, word)); } else { readings.addAll( getAnalyzedTokens(linkedTaggerTokens, word, compoundedWord)); } } } else { readings.add(getNoInfoToken(word)); } } } else { // last part governs a word's POS: String lastPart = compoundParts.get(compoundParts.size() - 1); if (StringTools.startsWithUppercase(word)) { lastPart = StringTools.uppercaseFirstChar(lastPart); } List<TaggedWord> partTaggerTokens = getWordTagger().tag(lastPart); if (partTaggerTokens.isEmpty()) { readings.add(getNoInfoToken(word)); } else { readings.addAll(getAnalyzedTokens(partTaggerTokens, word, compoundParts)); } } } else { readings.add(getNoInfoToken(word)); } } tokenReadings.add(new AnalyzedTokenReadings(readings.toArray(new AnalyzedToken[readings.size()]), pos)); pos += word.length(); prevWord = word; } return tokenReadings; }