Example usage of the org.apache.commons.codec.language.ColognePhonetic constructor ColognePhonetic()

List of usage examples for the org.apache.commons.codec.language.ColognePhonetic constructor ColognePhonetic()

Introduction

On this page you can find example usages of the ColognePhonetic() constructor of the class org.apache.commons.codec.language.ColognePhonetic.

Prototype

ColognePhonetic

Source Link

Usage

From source file:dkpro.similarity.algorithms.sound.ColognePhoneticComparator.java

/**
 * Creates a comparator whose encoder is a newly constructed {@link ColognePhonetic}.
 */
public ColognePhoneticComparator() {
    this.encoder = new ColognePhonetic();
}

From source file:de.tudarmstadt.ukp.dkpro.core.commonscodec.ColognePhoneticTranscriptor.java

/**
 * Creates a transcriptor whose encoder is a newly constructed {@link ColognePhonetic}.
 */
public ColognePhoneticTranscriptor() {
    encoder = new ColognePhonetic();
}

From source file:at.itbh.bev.index.TextAnalyzer.java

/**
 * Assembles the analysis chain: a keyword tokenizer followed by lower-casing, two
 * pattern-replace steps (address-line stemming pattern, then non-alpha pattern, each
 * replaced by the empty string), a phonetic filter backed by {@link ColognePhonetic},
 * and an n-gram filter configured with the bounds 2 and 6.
 *
 * @param fieldName name of the field being analyzed; not read by this implementation
 * @return components made of the tokenizer and the last filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();

    // Each stage wraps the previous one; the order is significant.
    final TokenStream lowerCased = new LowerCaseFilter(tokenizer);
    final TokenStream addressStemmed = new PatternReplaceFilter(lowerCased,
            RegexPatternCollection.addressLineStemmingPattern, "", true);
    final TokenStream lettersOnly = new PatternReplaceFilter(addressStemmed,
            RegexPatternCollection.nonAlphaCharPattern, "", true);
    final TokenStream phonetic = new PhoneticFilter(lettersOnly, new ColognePhonetic(), true);
    final TokenStream nGrams = new NGramTokenFilter(phonetic, 2, 6);

    return new TokenStreamComponents(tokenizer, nGrams);
}

From source file:com.jaeksoft.searchlib.analysis.filter.PhoneticFilter.java

/**
 * Wraps the given token stream in the phonetic filter selected by {@code codec}.
 *
 * @param tokenStream the stream to wrap
 * @return the wrapping filter, or {@code null} when {@code codec} equals none of the
 *         codec constants checked here
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    if (BEIDER_MORSE.equals(codec)) {
        // Beider-Morse is the only codec configured through rule type and max phonemes.
        return new BeiderMorseTokenFilter(tokenStream, new EncoderKey(ruleType, maxPhonemes));
    } else if (COLOGNE_PHONETIC.equals(codec)) {
        return new EncoderTokenFilter(tokenStream, new ColognePhonetic());
    } else if (SOUNDEX.equals(codec)) {
        return new EncoderTokenFilter(tokenStream, new Soundex());
    } else if (REFINED_SOUNDEX.equals(codec)) {
        return new EncoderTokenFilter(tokenStream, new RefinedSoundex());
    } else if (METAPHONE.equals(codec)) {
        return new EncoderTokenFilter(tokenStream, new Metaphone());
    } else if (CAVERPHONE1.equals(codec)) {
        return new EncoderTokenFilter(tokenStream, new Caverphone1());
    } else if (CAVERPHONE2.equals(codec)) {
        return new EncoderTokenFilter(tokenStream, new Caverphone2());
    } else {
        // No matching codec: callers receive null.
        return null;
    }
}

From source file:com.example.PhoneticTokenFilterFactory.java

/**
 * Configures the factory from the filter settings.
 *
 * <p>The {@code encoder} setting (default {@code "metaphone"}, matched case-insensitively)
 * selects the phonetic encoder. For double metaphone and Beider-Morse the {@code encoder}
 * field is left {@code null} and the dedicated fields ({@code maxcodelength}, or
 * {@code languageset} / {@code ruletype} / {@code nametype}) are populated instead.
 *
 * @param index the index this filter belongs to
 * @param indexSettingsService source of the index-level settings passed to the superclass
 * @param name the filter name
 * @param settings the filter settings ({@code replace}, {@code encoder}, {@code max_code_len},
 *        {@code languageset}, {@code rule_type}, {@code name_type})
 * @throws IllegalArgumentException if the encoder name or the Beider-Morse rule type is unknown
 */
@Inject
public PhoneticTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, @Assisted String name,
        @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    // Defaults; the double-metaphone and Beider-Morse branches overwrite some of them.
    this.languageset = null;
    this.nametype = null;
    this.ruletype = null;
    this.maxcodelength = 0;
    this.replace = settings.getAsBoolean("replace", true);

    // weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default
    final String requested = settings.get("encoder", "metaphone");

    final boolean refinedSoundex = "refined_soundex".equalsIgnoreCase(requested)
            || "refinedSoundex".equalsIgnoreCase(requested);
    final boolean doubleMetaphone = "double_metaphone".equalsIgnoreCase(requested)
            || "doubleMetaphone".equalsIgnoreCase(requested);
    final boolean beiderMorse = "bm".equalsIgnoreCase(requested) || "beider_morse".equalsIgnoreCase(requested)
            || "beidermorse".equalsIgnoreCase(requested);

    if ("metaphone".equalsIgnoreCase(requested)) {
        this.encoder = new Metaphone();
    } else if ("soundex".equalsIgnoreCase(requested)) {
        this.encoder = new Soundex();
    } else if ("caverphone1".equalsIgnoreCase(requested)) {
        this.encoder = new Caverphone1();
    } else if ("caverphone2".equalsIgnoreCase(requested) || "caverphone".equalsIgnoreCase(requested)) {
        // Plain "caverphone" is an alias for the second revision.
        this.encoder = new Caverphone2();
    } else if (refinedSoundex) {
        this.encoder = new RefinedSoundex();
    } else if ("cologne".equalsIgnoreCase(requested)) {
        this.encoder = new ColognePhonetic();
    } else if (doubleMetaphone) {
        this.encoder = null;
        this.maxcodelength = settings.getAsInt("max_code_len", 4);
    } else if (beiderMorse) {
        this.encoder = null;
        this.languageset = settings.getAsArray("languageset");

        final String ruleTypeName = settings.get("rule_type", "approx");
        if ("approx".equalsIgnoreCase(ruleTypeName)) {
            this.ruletype = RuleType.APPROX;
        } else if ("exact".equalsIgnoreCase(ruleTypeName)) {
            this.ruletype = RuleType.EXACT;
        } else {
            throw new IllegalArgumentException(
                    "No matching rule type [" + ruleTypeName + "] for beider morse encoder");
        }

        // NOTE(review): unlike rule_type, an unrecognized name_type is not rejected here;
        // nametype simply stays null.
        final String nameTypeName = settings.get("name_type", "generic");
        if ("GENERIC".equalsIgnoreCase(nameTypeName)) {
            this.nametype = NameType.GENERIC;
        } else if ("ASHKENAZI".equalsIgnoreCase(nameTypeName)) {
            this.nametype = NameType.ASHKENAZI;
        } else if ("SEPHARDIC".equalsIgnoreCase(nameTypeName)) {
            this.nametype = NameType.SEPHARDIC;
        }
    } else if ("koelnerphonetik".equalsIgnoreCase(requested)) {
        this.encoder = new KoelnerPhonetik();
    } else if ("haasephonetik".equalsIgnoreCase(requested)) {
        this.encoder = new HaasePhonetik();
    } else if ("nysiis".equalsIgnoreCase(requested)) {
        this.encoder = new Nysiis();
    } else if ("daitch_mokotoff".equalsIgnoreCase(requested)) {
        this.encoder = new DaitchMokotoffSoundex();
    } else {
        throw new IllegalArgumentException("unknown encoder [" + requested + "] for phonetic token filter");
    }
}

From source file:de.micromata.genome.util.matcher.norm.StringNormalizeUtils.java

/**
 * Normalizes a string by applying, in a fixed order, every step whose flag occurs in
 * {@code flags}: accent removal ({@code REPLACEUMLAUTS}), upper-casing ({@code UPPERCASE}),
 * removal of everything matched by {@code NON_WHITESPACE_PATTERN} ({@code NOWHITESPACE}),
 * removal of everything matched by {@code NON_ALPHANUM_PATTERN}
 * ({@code ASCIILETTERNUMBERSONLY}), US-English Soundex encoding ({@code SOUNDEX}) and
 * Cologne phonetic encoding ({@code COLOGNE}).
 *
 * @param text the text; may be {@code null}
 * @param flags a collection of characters. See public static finals in this class.
 * @return the normalized string, or {@code null} if {@code text} is {@code null}
 */
public static String normalize(String text, String flags) {
    if (text == null) {
        return null;
    }
    if (flags.indexOf(REPLACEUMLAUTS) != -1) {
        text = deAccent(text);
    }
    if (flags.indexOf(UPPERCASE) != -1) {
        // Locale.ROOT keeps the result independent of the JVM default locale; with a
        // Turkish default locale, plain toUpperCase() maps 'i' to the non-ASCII dotted 'İ',
        // which the later ASCII-only and phonetic steps would not treat as the letter I.
        text = text.toUpperCase(java.util.Locale.ROOT);
    }
    if (flags.indexOf(NOWHITESPACE) != -1) {
        text = NON_WHITESPACE_PATTERN.get().matcher(text).replaceAll("");
    }
    if (flags.indexOf(ASCIILETTERNUMBERSONLY) != -1) {
        text = NON_ALPHANUM_PATTERN.get().matcher(text).replaceAll("");
    }
    if (flags.indexOf(SOUNDEX) != -1) {
        text = Soundex.US_ENGLISH.encode(text);
    }
    if (flags.indexOf(COLOGNE) != -1) {
        text = new ColognePhonetic().encode(text);
    }
    return text;
}

From source file:at.jps.sanction.core.util.TokenTool.java

/**
 * Encodes both texts with Cologne phonetic and delegates the comparison of the two codes
 * to {@code compareCheck}.
 *
 * @param text1 first text to encode
 * @param text2 second text to encode
 * @param fuzzy passed through to {@code compareCheck}
 * @param minlen passed through to {@code compareCheck}
 * @param fuzzyValue passed through to {@code compareCheck}
 * @return the result of {@code compareCheck} on the two phonetic codes
 */
public static float compareCheckColognePhonetic(final String text1, final String text2, final boolean fuzzy,
        final int minlen, final double fuzzyValue) {
    // TODO: in real life make this per-call allocation go away !!
    final ColognePhonetic phonetic = new ColognePhonetic();

    final String code1 = phonetic.colognePhonetic(text1);
    final String code2 = phonetic.colognePhonetic(text2);

    return compareCheck(code1, code2, fuzzy, minlen, fuzzyValue);
}

From source file:io.klerch.alexa.tellask.model.AlexaInput.java

/**
 * Checks whether a slot is present with a non-blank value that is a phonetic sibling of
 * the given string. The comparison uses the Cologne phonetic algorithm, which is
 * optimized for the German language.
 *
 * @param slotName name of the slot to look after
 * @param value the value to compare the slot value with
 * @return true if the slot is not blank, {@code value} is not null, and slot value and
 *         given value are phonetically equal according to Cologne phonetic; false otherwise
 */
public boolean hasSlotIsCologneEqual(final String slotName, final String value) {
    final String slotValue = getSlotValue(slotName);
    if (!hasSlotNotBlank(slotName) || value == null) {
        return false;
    }
    final ColognePhonetic phonetic = new ColognePhonetic();
    return phonetic.isEncodeEqual(slotValue, value);
}

From source file:de.tudarmstadt.ukp.dkpro.tc.features.ngram.util.NGramUtils.java

/**
 * Counts phonetic n-grams over all sentences of the document. Every token of a sentence
 * is replaced by its phonetic code (Soundex for language {@code "en"}, Cologne phonetic
 * for {@code "de"}); the n-grams built from each sentence's code sequence are joined with
 * {@code NGRAM_GLUE} and counted.
 *
 * @param jcas the document
 * @param minN lower n-gram bound passed to {@code NGramStringListIterable}
 * @param maxN upper n-gram bound passed to {@code NGramStringListIterable}
 * @return frequency distribution of the joined phonetic n-grams
 * @throws TextClassificationException if the document language is neither {@code "en"} nor
 *         {@code "de"} (including a missing language), or if encoding a token fails
 */
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, int minN, int maxN)
        throws TextClassificationException {
    String languageCode = jcas.getDocumentLanguage();

    // Constant-first comparison: a null language code falls through to the
    // "not supported" exception below instead of raising a NullPointerException.
    StringEncoder encoder;
    if ("en".equals(languageCode)) {
        encoder = new Soundex();
    } else if ("de".equals(languageCode)) {
        encoder = new ColognePhonetic();
    } else {
        throw new TextClassificationException(
                "Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }

    FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>();
    for (Sentence s : select(jcas, Sentence.class)) {
        // Phonetic codes of the sentence's tokens, in token order.
        List<String> phoneticStrings = new ArrayList<String>();
        for (Token t : JCasUtil.selectCovered(jcas, Token.class, s)) {
            try {
                phoneticStrings.add(encoder.encode(t.getCoveredText()));
            } catch (EncoderException e) {
                throw new TextClassificationException(e);
            }
        }
        String[] array = phoneticStrings.toArray(new String[phoneticStrings.size()]);

        for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) {
            phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return phoneticNgrams;
}

From source file:org.dkpro.tc.features.ngram.util.NGramUtils.java

/**
 * Counts phonetic n-grams over the sentences covered by {@code target}. Every token of a
 * sentence is replaced by its phonetic code (Soundex for language {@code "en"}, Cologne
 * phonetic for {@code "de"}); the n-grams built from each sentence's code sequence are
 * joined with {@code NGRAM_GLUE} and counted.
 *
 * @param jcas the document
 * @param target annotation whose covered sentences are processed
 * @param minN lower n-gram bound passed to {@code NGramStringListIterable}
 * @param maxN upper n-gram bound passed to {@code NGramStringListIterable}
 * @return frequency distribution of the joined phonetic n-grams
 * @throws TextClassificationException if the document language is neither {@code "en"} nor
 *         {@code "de"} (including a missing language), or if encoding a token fails
 */
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN,
        int maxN) throws TextClassificationException {
    String languageCode = jcas.getDocumentLanguage();

    // Constant-first comparison: a null language code falls through to the
    // "not supported" exception below instead of raising a NullPointerException.
    StringEncoder encoder;
    if ("en".equals(languageCode)) {
        encoder = new Soundex();
    } else if ("de".equals(languageCode)) {
        encoder = new ColognePhonetic();
    } else {
        throw new TextClassificationException(
                "Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }

    FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>();
    for (Sentence s : selectCovered(jcas, Sentence.class, target)) {
        // Phonetic codes of the sentence's tokens, in token order.
        List<String> phoneticStrings = new ArrayList<String>();
        for (Token t : JCasUtil.selectCovered(jcas, Token.class, s)) {
            try {
                phoneticStrings.add(encoder.encode(t.getCoveredText()));
            } catch (EncoderException e) {
                throw new TextClassificationException(e);
            }
        }
        String[] array = phoneticStrings.toArray(new String[phoneticStrings.size()]);

        for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) {
            phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return phoneticNgrams;
}