Example usage for org.apache.commons.lang3 RegExUtils removePattern

Introduction

This page shows example usages of org.apache.commons.lang3.RegExUtils.removePattern.

Prototype

public static String removePattern(final String text, final String regex)

Source Link

Document

Removes each substring of the source String that matches the given regular expression using the DOTALL option.

This call is a null safe equivalent to:

text.replaceAll("(?s)" + regex, StringUtils.EMPTY)
Pattern.compile(regex, Pattern.DOTALL).matcher(text).replaceAll(StringUtils.EMPTY)

A null reference passed to this method is a no-op.

 StringUtils.removePattern(null, *)       = null
 StringUtils.removePattern("any", (String) null)   = "any"
 StringUtils.removePattern("A<__>\n<__>B", "<.*>")  = "AB"
 StringUtils.removePattern("ABCabc123", "[a-z]")    = "ABC123"

Usage

From source file: org.languagetool.rules.de.GermanSpellerRule.java

/**
 * Decides whether {@code word} should be accepted because it consists of one of a
 * fixed set of prefixes ("bitter", "dunkel", "erz", ...) followed by a remainder that
 * {@code isMisspelled} does not report as misspelled.
 *
 * @param word the word to inspect
 * @return {@code true} if {@code word} starts with one of the listed prefixes and the
 *         text left after stripping that prefix is not misspelled; {@code false} if the
 *         word has none of the prefixes or the remainder is misspelled
 */
private boolean ignoreElative(String word) {
    // The "frueh" prefix is spelled with a u-umlaut; it is written as a Unicode escape so
    // the literal cannot lose the character again when the file is re-encoded.
    if (StringUtils.startsWithAny(word, "bitter", "dunkel", "erz", "extra", "fr\u00fch", "gemein", "hyper", "lau",
            "mega", "minder", "stock", "super", "tod", "ultra", "ur")) {
        // NOTE(review): the alternatives "grund" and "voll" below can never match, because
        // the guard above does not admit words with those prefixes - confirm whether the
        // guard list or the pattern reflects the intended prefix set.
        String lastPart = RegExUtils.removePattern(word,
                "^(bitter|dunkel|erz|extra|fr\u00fch|gemein|grund|hyper|lau|mega|minder|stock|super|tod|ultra|ur|voll)");
        return !isMisspelled(lastPart);
    }
    return false;
}

From source file: org.languagetool.tagging.de.GermanTagger.java

/**
 * Produces one {@code AnalyzedTokenReadings} per entry of {@code sentenceTokens}.
 *
 * <p>For each token the word tagger is consulted first. If it yields nothing, the token
 * is handled in this order: compound decomposition (the last compound part is tagged),
 * imperative forms, substantivated forms, prefix stripping ("bitter", "dunkel", ...),
 * and finally the dash-linked/sanitized-word analysis. A token for which none of these
 * yields a reading receives the result of {@code getNoInfoToken}.
 *
 * <p>The start position passed to each {@code AnalyzedTokenReadings} is the sum of the
 * lengths of all preceding tokens.
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @param ignoreCase     when {@code true}, the lower-cased form of a token is additionally
 *                       looked up for tokens at the sentence start, after a ":" and at the
 *                       start of direct speech
 * @return the readings for every token, in the same order as {@code sentenceTokens}
 * @throws IOException declared by the signature; presumably raised by the initialization
 *                     or tagging helpers - confirm against their declarations
 */
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens, boolean ignoreCase) throws IOException {
    initializeIfRequired();

    boolean firstWord = true;
    List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    // Index of the current token within sentenceTokens. Tracked explicitly because
    // sentenceTokens.indexOf(word) would return the FIRST equal token, which is the
    // wrong position whenever the same token text occurs earlier in the sentence.
    int tokenIndex = -1;

    String prevWord = null;
    for (String word : sentenceTokens) {
        tokenIndex++;
        List<AnalyzedToken> readings = new ArrayList<>();
        List<TaggedWord> taggerTokens = getWordTagger().tag(word);

        //Only first iteration. Consider ":" as a potential sentence start marker
        if ((firstWord || ":".equals(prevWord)) && taggerTokens.isEmpty() && ignoreCase) { // e.g. "Das" -> "das" at start of sentence
            taggerTokens = getWordTagger().tag(word.toLowerCase());
            // stay in "first word" mode while the token is empty or a single non-word character
            firstWord = word.matches("^\\W?$");
        } else if (pos == 0 && ignoreCase) { // "Haben", "Sollen", "Koennen", "Gerade" etc. at start of sentence
            taggerTokens.addAll(getWordTagger().tag(word.toLowerCase()));
        } else if (pos > 1 && taggerTokens.isEmpty() && ignoreCase) {
            // add lowercase token readings to words at start of direct speech:
            // the previous token must be the opening low double quotation mark (U+201E,
            // written as an escape so it survives re-encoding) and the token three
            // positions back must be ":"
            if (tokenIndex > 2 && sentenceTokens.get(tokenIndex - 1).contentEquals("\u201e")
                    && sentenceTokens.get(tokenIndex - 3).contentEquals(":")) {
                taggerTokens.addAll(getWordTagger().tag(word.toLowerCase()));
            }
        }

        if (!taggerTokens.isEmpty()) { //Word known, just add analyzed token to readings
            readings.addAll(getAnalyzedTokens(taggerTokens, word));
        } else { // Word not known, try to decompose it and use the last part for POS tagging:
            if (!StringTools.isEmpty(word.trim())) {
                List<String> compoundParts = compoundTokenizer.tokenize(word);
                if (compoundParts.size() <= 1) {//Could not find simple compound parts
                    // Recognize alternative imperative forms (e.g., "Geh bitte!" in addition to "Gehe bitte!")
                    List<AnalyzedToken> imperativeFormList = getImperativeForm(word, sentenceTokens, pos);
                    List<AnalyzedToken> substantivatedFormsList = getSubstantivatedForms(word, sentenceTokens,
                            pos);
                    if (!imperativeFormList.isEmpty()) {
                        readings.addAll(imperativeFormList);
                    } else if (!substantivatedFormsList.isEmpty()) {
                        readings.addAll(substantivatedFormsList);
                    } else {
                        // Prefixed words: tag the part after the prefix and prepend the
                        // prefix to each lemma. The "frueh" prefix contains a u-umlaut and
                        // is written as a Unicode escape.
                        if (StringUtils.startsWithAny(word, "bitter", "dunkel", "erz", "extra", "fr\u00fch",
                                "gemein", "hyper", "lau", "mega", "minder", "stock", "super", "tod", "ultra",
                                "ur")) {
                            // NOTE(review): "grund" and "voll" in the pattern are unreachable
                            // behind the guard above - confirm which list is intended.
                            String lastPart = RegExUtils.removePattern(word,
                                    "^(bitter|dunkel|erz|extra|fr\u00fch|gemein|grund|hyper|lau|mega|minder|stock|super|tod|ultra|ur|voll)");
                            if (lastPart.length() > 1) {
                                String firstPart = StringUtils.removeEnd(word, lastPart);
                                List<TaggedWord> taggedWords = getWordTagger().tag(lastPart);
                                for (TaggedWord taggedWord : taggedWords) {
                                    readings.add(new AnalyzedToken(word, taggedWord.getPosTag(),
                                            firstPart + taggedWord.getLemma()));
                                }
                            }
                        }
                        //Separate dash-linked words
                        //Only check single word tokens and skip words containing numbers because it's unpredictable
                        // (word is non-empty here: its trimmed form was checked above)
                        if (word.split(" ").length == 1 && !Character.isDigit(word.charAt(0))) {
                            String wordOrig = word;
                            word = sanitizeWord(word);
                            // whatever sanitizeWord removed from the front of the original token
                            String wordStem = wordOrig.substring(0, wordOrig.length() - word.length());

                            //Tokenize, start word uppercase if it's a result of splitting
                            List<String> compoundedWord = compoundTokenizer.tokenize(word);
                            if (compoundedWord.size() > 1) {
                                word = StringTools
                                        .uppercaseFirstChar(compoundedWord.get(compoundedWord.size() - 1));
                            } else {
                                word = compoundedWord.get(compoundedWord.size() - 1);
                            }

                            List<TaggedWord> linkedTaggerTokens = addStem(getWordTagger().tag(word), wordStem); //Try to analyze the last part found

                            //Some words that are linked with a dash ('-') will be written in uppercase, even adjectives
                            if (wordOrig.contains("-") && linkedTaggerTokens.isEmpty()
                                    && matchesUppercaseAdjective(word)) {
                                word = StringTools.lowercaseFirstChar(word);
                                linkedTaggerTokens = getWordTagger().tag(word);
                            }

                            // restore the loop variable so pos and prevWord below use the original token
                            word = wordOrig;

                            boolean wordStartsUppercase = StringTools.startsWithUppercase(word);
                            if (linkedTaggerTokens.isEmpty()) {
                                readings.add(getNoInfoToken(word));
                            } else {
                                if (wordStartsUppercase) { //Choose between uppercase/lowercase Lemma
                                    readings.addAll(getAnalyzedTokens(linkedTaggerTokens, word));
                                } else {
                                    readings.addAll(
                                            getAnalyzedTokens(linkedTaggerTokens, word, compoundedWord));
                                }
                            }
                        } else {
                            readings.add(getNoInfoToken(word));
                        }
                    }
                } else {
                    // last part governs a word's POS:
                    String lastPart = compoundParts.get(compoundParts.size() - 1);
                    if (StringTools.startsWithUppercase(word)) {
                        lastPart = StringTools.uppercaseFirstChar(lastPart);
                    }
                    List<TaggedWord> partTaggerTokens = getWordTagger().tag(lastPart);
                    if (partTaggerTokens.isEmpty()) {
                        readings.add(getNoInfoToken(word));
                    } else {
                        readings.addAll(getAnalyzedTokens(partTaggerTokens, word, compoundParts));
                    }
                }
            } else {
                readings.add(getNoInfoToken(word));
            }
        }
        tokenReadings.add(new AnalyzedTokenReadings(readings.toArray(new AnalyzedToken[0]), pos));
        pos += word.length();
        prevWord = word;
    }
    return tokenReadings;
}