Usage examples for org.apache.commons.codec.StringEncoder.encode(String)
/**
 * Encodes a String and returns the encoded result as a String.
 *
 * @param pString the String to encode
 * @return the encoded String
 * @throws EncoderException declared by the signature; presumably raised when the
 *         implementation cannot encode the input (confirm against the implementing class)
 */
String encode(String pString) throws EncoderException;
From source file:de.tudarmstadt.ukp.dkpro.tc.features.ngram.util.NGramUtils.java
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, int minN, int maxN) throws TextClassificationException { StringEncoder encoder; String languageCode = jcas.getDocumentLanguage(); if (languageCode.equals("en")) { encoder = new Soundex(); } else if (languageCode.equals("de")) { encoder = new ColognePhonetic(); } else {//from ww w. ja va 2 s . co m throw new TextClassificationException( "Language code '" + languageCode + "' not supported by phonetic ngrams FE."); } FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>(); for (Sentence s : select(jcas, Sentence.class)) { List<String> phoneticStrings = new ArrayList<String>(); for (Token t : JCasUtil.selectCovered(jcas, Token.class, s)) { try { phoneticStrings.add(encoder.encode(t.getCoveredText())); } catch (EncoderException e) { throw new TextClassificationException(e); } } String[] array = phoneticStrings.toArray(new String[phoneticStrings.size()]); for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) { phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE)); } } return phoneticNgrams; }
From source file:com.github.jinahya.codec.commons.RareStringEncoderProxyTest.java
@Test public void testAsStringEncoder() throws EncoderException { final StringEncoder encoder = (StringEncoder) RareStringEncoderProxy.newInstance(); try {//from w w w .j av a2s. c o m encoder.encode((String) null); Assert.fail("passed: encode((String) null)"); } catch (final NullPointerException npe) { // expected } final String expected = ""; final String actual = encoder.encode(expected); Assert.assertEquals(actual, expected); }
From source file:net.yacy.cora.language.phonetic.Soundex.java
/** * Encodes the Strings and returns the number of characters in the two * encoded Strings that are the same./*from ww w. j a v a 2 s . c om*/ * <ul> * <li>For Soundex, this return value ranges from 0 through 4: 0 indicates * little or no similarity, and 4 indicates strong similarity or identical * values.</li> * <li>For refined Soundex, the return value can be greater than 4.</li> * </ul> * * @param encoder * The encoder to use to encode the Strings. * @param s1 * A String that will be encoded and compared. * @param s2 * A String that will be encoded and compared. * @return The number of characters in the two Soundex encoded Strings that * are the same. * * @see #differenceEncoded(String,String) * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> * MS T-SQL DIFFERENCE</a> * * @throws EncoderException * if an error occurs encoding one of the strings */ static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException { return differenceEncoded(encoder.encode(s1), encoder.encode(s2)); }
From source file:org.apache.abdera2.common.text.Codec.java
/**
 * Encodes {@code value} with the codec that corresponds to this constant:
 * {@code QCodec} for {@code Q}, {@code BCodec} for {@code B}, and {@code StarCodec}
 * for every other constant.
 *
 * <p>This is a best-effort operation: if constructing the codec or encoding the
 * value throws any exception, the original {@code value} is returned unchanged.
 *
 * @param value the text to encode; {@code null} is returned as {@code null}
 * @param charset the charset name handed to the codec constructor
 * @return the encoded text, the unmodified {@code value} on failure, or {@code null}
 *         when {@code value} is {@code null}
 */
private String _encode(String value, String charset) {
    if (value == null) {
        return null;
    }
    try {
        final StringEncoder codec;
        if (this == Q) {
            codec = new QCodec(charset);
        } else if (this == B) {
            codec = new BCodec(charset);
        } else {
            codec = new StarCodec(charset);
        }
        return codec.encode(value);
    } catch (Exception ignored) {
        // Deliberate fallback: any failure yields the input as-is rather than an error.
        return value;
    }
}
From source file:org.dkpro.tc.features.ngram.util.NGramUtils.java
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN, int maxN) throws TextClassificationException { StringEncoder encoder; String languageCode = jcas.getDocumentLanguage(); if (languageCode.equals("en")) { encoder = new Soundex(); } else if (languageCode.equals("de")) { encoder = new ColognePhonetic(); } else {//w ww . j a va 2 s. c o m throw new TextClassificationException( "Language code '" + languageCode + "' not supported by phonetic ngrams FE."); } FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>(); for (Sentence s : selectCovered(jcas, Sentence.class, target)) { List<String> phoneticStrings = new ArrayList<String>(); for (Token t : JCasUtil.selectCovered(jcas, Token.class, s)) { try { phoneticStrings.add(encoder.encode(t.getCoveredText())); } catch (EncoderException e) { throw new TextClassificationException(e); } } String[] array = phoneticStrings.toArray(new String[phoneticStrings.size()]); for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) { phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE)); } } return phoneticNgrams; }