List of usage examples for the org.apache.commons.codec.language.ColognePhonetic constructor ColognePhonetic()
ColognePhonetic
From source file:dkpro.similarity.algorithms.sound.ColognePhoneticComparator.java
/**
 * Creates the comparator and sets its {@code encoder} to a new
 * {@link ColognePhonetic} instance.
 */
public ColognePhoneticComparator() {
    this.encoder = new ColognePhonetic();
}
From source file:de.tudarmstadt.ukp.dkpro.core.commonscodec.ColognePhoneticTranscriptor.java
/**
 * Creates the transcriptor and sets its {@code encoder} to a new
 * {@link ColognePhonetic} instance.
 */
public ColognePhoneticTranscriptor() {
    encoder = new ColognePhonetic();
}
From source file:at.itbh.bev.index.TextAnalyzer.java
/**
 * Builds the token stream chain used by this analyzer.
 *
 * <p>The chain is, in order: a {@link KeywordTokenizer}, a {@link LowerCaseFilter},
 * two {@link PatternReplaceFilter}s that replace matches of
 * {@code RegexPatternCollection.addressLineStemmingPattern} and
 * {@code RegexPatternCollection.nonAlphaCharPattern} with the empty string,
 * a {@link PhoneticFilter} backed by {@link ColognePhonetic}, and finally an
 * {@link NGramTokenFilter} constructed with the values 2 and 6
 * (presumably the minimum and maximum gram size; confirm against the Lucene version in use).
 *
 * @param fieldName the field being analyzed; not used by this implementation
 * @return the components made of the tokenizer and the last filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new KeywordTokenizer();

    final TokenStream lowerCased = new LowerCaseFilter(tokenizer);
    final TokenStream addressStemmed = new PatternReplaceFilter(
            lowerCased, RegexPatternCollection.addressLineStemmingPattern, "", true);
    final TokenStream lettersOnly = new PatternReplaceFilter(
            addressStemmed, RegexPatternCollection.nonAlphaCharPattern, "", true);
    final TokenStream phonetic = new PhoneticFilter(lettersOnly, new ColognePhonetic(), true);
    final TokenStream ngrams = new NGramTokenFilter(phonetic, 2, 6);

    return new TokenStreamComponents(tokenizer, ngrams);
}
From source file:com.jaeksoft.searchlib.analysis.filter.PhoneticFilter.java
@Override public TokenStream create(TokenStream tokenStream) { if (BEIDER_MORSE.equals(codec)) return new BeiderMorseTokenFilter(tokenStream, new EncoderKey(ruleType, maxPhonemes)); if (COLOGNE_PHONETIC.equals(codec)) return new EncoderTokenFilter(tokenStream, new ColognePhonetic()); if (SOUNDEX.equals(codec)) return new EncoderTokenFilter(tokenStream, new Soundex()); if (REFINED_SOUNDEX.equals(codec)) return new EncoderTokenFilter(tokenStream, new RefinedSoundex()); if (METAPHONE.equals(codec)) return new EncoderTokenFilter(tokenStream, new Metaphone()); if (CAVERPHONE1.equals(codec)) return new EncoderTokenFilter(tokenStream, new Caverphone1()); if (CAVERPHONE2.equals(codec)) return new EncoderTokenFilter(tokenStream, new Caverphone2()); return null;//from w w w. j ava 2 s .co m }
From source file:com.example.PhoneticTokenFilterFactory.java
@Inject public PhoneticTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, @Assisted String name, @Assisted Settings settings) {//from w w w . j av a2s . c o m super(index, indexSettingsService.getSettings(), name, settings); this.languageset = null; this.nametype = null; this.ruletype = null; this.maxcodelength = 0; this.replace = settings.getAsBoolean("replace", true); // weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default String encodername = settings.get("encoder", "metaphone"); if ("metaphone".equalsIgnoreCase(encodername)) { this.encoder = new Metaphone(); } else if ("soundex".equalsIgnoreCase(encodername)) { this.encoder = new Soundex(); } else if ("caverphone1".equalsIgnoreCase(encodername)) { this.encoder = new Caverphone1(); } else if ("caverphone2".equalsIgnoreCase(encodername)) { this.encoder = new Caverphone2(); } else if ("caverphone".equalsIgnoreCase(encodername)) { this.encoder = new Caverphone2(); } else if ("refined_soundex".equalsIgnoreCase(encodername) || "refinedSoundex".equalsIgnoreCase(encodername)) { this.encoder = new RefinedSoundex(); } else if ("cologne".equalsIgnoreCase(encodername)) { this.encoder = new ColognePhonetic(); } else if ("double_metaphone".equalsIgnoreCase(encodername) || "doubleMetaphone".equalsIgnoreCase(encodername)) { this.encoder = null; this.maxcodelength = settings.getAsInt("max_code_len", 4); } else if ("bm".equalsIgnoreCase(encodername) || "beider_morse".equalsIgnoreCase(encodername) || "beidermorse".equalsIgnoreCase(encodername)) { this.encoder = null; this.languageset = settings.getAsArray("languageset"); String ruleType = settings.get("rule_type", "approx"); if ("approx".equalsIgnoreCase(ruleType)) { ruletype = RuleType.APPROX; } else if ("exact".equalsIgnoreCase(ruleType)) { ruletype = RuleType.EXACT; } else { throw new IllegalArgumentException( "No matching rule type [" + ruleType + "] for beider morse encoder"); } String nameType = 
settings.get("name_type", "generic"); if ("GENERIC".equalsIgnoreCase(nameType)) { nametype = NameType.GENERIC; } else if ("ASHKENAZI".equalsIgnoreCase(nameType)) { nametype = NameType.ASHKENAZI; } else if ("SEPHARDIC".equalsIgnoreCase(nameType)) { nametype = NameType.SEPHARDIC; } } else if ("koelnerphonetik".equalsIgnoreCase(encodername)) { this.encoder = new KoelnerPhonetik(); } else if ("haasephonetik".equalsIgnoreCase(encodername)) { this.encoder = new HaasePhonetik(); } else if ("nysiis".equalsIgnoreCase(encodername)) { this.encoder = new Nysiis(); } else if ("daitch_mokotoff".equalsIgnoreCase(encodername)) { this.encoder = new DaitchMokotoffSoundex(); } else { throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter"); } }
From source file:de.micromata.genome.util.matcher.norm.StringNormalizeUtils.java
/** * normalize a string.//from ww w. j a v a2 s. c om * * @param text the text * @param flags a collection of characters. See public static finals in this class. * @return the string */ public static String normalize(String text, String flags) { if (text == null) { return text; } if (flags.indexOf(REPLACEUMLAUTS) != -1) { text = deAccent(text); //replaceUmlauts(text); } if (flags.indexOf(UPPERCASE) != -1) { text = text.toUpperCase(); } if (flags.indexOf(NOWHITESPACE) != -1) { text = NON_WHITESPACE_PATTERN.get().matcher(text).replaceAll(""); } if (flags.indexOf(ASCIILETTERNUMBERSONLY) != -1) { text = NON_ALPHANUM_PATTERN.get().matcher(text).replaceAll(""); } if (flags.indexOf(SOUNDEX) != -1) { text = Soundex.US_ENGLISH.encode(text); } if (flags.indexOf(COLOGNE) != -1) { text = new ColognePhonetic().encode(text); } return text; }
From source file:at.jps.sanction.core.util.TokenTool.java
public static float compareCheckColognePhonetic(final String text1, final String text2, final boolean fuzzy, final int minlen, final double fuzzyValue) { final ColognePhonetic encoder = new ColognePhonetic(); // TODO: in reallife // make/* w w w . ja v a 2s. c o m*/ // this go away !! return (compareCheck(encoder.colognePhonetic(text1), encoder.colognePhonetic(text2), fuzzy, minlen, fuzzyValue)); }
From source file:io.klerch.alexa.tellask.model.AlexaInput.java
/**
 * Checks if a slot is contained in the intent request and has a value which is a
 * phonetic sibling of the string given to this method. Cologne phonetic algorithm
 * is optimized for German language and in this case is used to match slot value with
 * value given to this method.
 *
 * @param slotName name of the slot to look after
 * @param value the value
 * @return True, if slot value and given value are phonetically equal with Cologne phonetic algorithm;
 *         false if the slot is blank or {@code value} is {@code null}
 */
public boolean hasSlotIsCologneEqual(final String slotName, final String value) {
    final String slotValue = getSlotValue(slotName);
    if (!hasSlotNotBlank(slotName) || value == null) {
        return false;
    }
    return new ColognePhonetic().isEncodeEqual(slotValue, value);
}
From source file:de.tudarmstadt.ukp.dkpro.tc.features.ngram.util.NGramUtils.java
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, int minN, int maxN) throws TextClassificationException { StringEncoder encoder;//w w w . java 2 s. c o m String languageCode = jcas.getDocumentLanguage(); if (languageCode.equals("en")) { encoder = new Soundex(); } else if (languageCode.equals("de")) { encoder = new ColognePhonetic(); } else { throw new TextClassificationException( "Language code '" + languageCode + "' not supported by phonetic ngrams FE."); } FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>(); for (Sentence s : select(jcas, Sentence.class)) { List<String> phoneticStrings = new ArrayList<String>(); for (Token t : JCasUtil.selectCovered(jcas, Token.class, s)) { try { phoneticStrings.add(encoder.encode(t.getCoveredText())); } catch (EncoderException e) { throw new TextClassificationException(e); } } String[] array = phoneticStrings.toArray(new String[phoneticStrings.size()]); for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) { phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE)); } } return phoneticNgrams; }
From source file:org.dkpro.tc.features.ngram.util.NGramUtils.java
/**
 * Counts phonetic n-grams over the sentences covered by the target annotation.
 *
 * <p>The encoder is chosen from the document language: {@link Soundex} for
 * {@code "en"} and {@link ColognePhonetic} for {@code "de"}. Within each sentence
 * the covered text of every token is encoded, and each n-gram produced by
 * {@code NGramStringListIterable} for the given size range is joined with
 * {@code NGRAM_GLUE} and counted.
 *
 * @param jcas the document
 * @param target the annotation whose covered sentences are processed
 * @param minN lower n-gram size bound passed to {@code NGramStringListIterable}
 * @param maxN upper n-gram size bound passed to {@code NGramStringListIterable}
 * @return the frequency distribution of the joined phonetic n-grams
 * @throws TextClassificationException if the document language is neither {@code "en"}
 *         nor {@code "de"}, or if encoding a token fails
 */
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN,
        int maxN) throws TextClassificationException {
    String languageCode = jcas.getDocumentLanguage();

    StringEncoder encoder;
    if (languageCode.equals("en")) {
        encoder = new Soundex();
    } else if (languageCode.equals("de")) {
        encoder = new ColognePhonetic();
    } else {
        throw new TextClassificationException(
                "Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }

    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, target)) {
        // Phonetic codes of the sentence's tokens, in token order.
        List<String> codes = new ArrayList<String>();
        try {
            for (Token token : JCasUtil.selectCovered(jcas, Token.class, sentence)) {
                codes.add(encoder.encode(token.getCoveredText()));
            }
        } catch (EncoderException e) {
            throw new TextClassificationException(e);
        }

        String[] codeArray = codes.toArray(new String[codes.size()]);
        for (List<String> ngram : new NGramStringListIterable(codeArray, minN, maxN)) {
            counts.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return counts;
}