Example usage for java.text Normalizer normalize

Introduction

On this page you can find example usages of java.text.Normalizer.normalize.

Prototype

public static String normalize(CharSequence src, Form form)

Source Link

Document

Normalize a sequence of char values.

Usage

From source file:nl.utwente.bigdata.bolts.NormalizerBolt.java

/**
 * Reads the "tweet" field of the tuple, derives a normalized form of its text
 * and emits the tweet together with that text and the tweet's language.
 * Tweets whose normalized text starts with "rt" are not emitted.
 *
 * @param tuple     incoming tuple; its "tweet" field is cast to {@code Status}
 * @param collector collector that receives (tweet, normalized text, language)
 */
@Override
public void execute(Tuple tuple, BasicOutputCollector collector) {
    final Status status = (Status) tuple.getValueByField("tweet");

    // Accent stripping approach taken from:
    // http://stackoverflow.com/questions/1008802/converting-symbols-accent-letters-to-english-alphabet
    // NFD splits accented letters into base letter + combining mark, so the
    // marks can be removed with a regular expression afterwards.
    final String decomposed = Normalizer.normalize(status.getText(), Normalizer.Form.NFD);
    final String lowered = decomposed.toLowerCase();

    final String normalizedTweet = Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
            .matcher(lowered)
            .replaceAll("")
            .replace("\n", "")
            .replace("\r", "");

    // Skip everything whose normalized text begins with "rt".
    if (normalizedTweet.startsWith("rt")) {
        return;
    }
    collector.emit(new Values(status, normalizedTweet, status.getLang()));
}

From source file:org.uiautomation.ios.server.utils.PlistFileUtils.java

/**
 * Loads the content of the file into a JSON object.
 * <p>
 * The file is read as UTF-8, the text is passed through
 * {@link Normalizer#normalize(CharSequence, Normalizer.Form)} using
 * {@code LanguageDictionary.norme}, and the result is handed to the
 * {@code JSONObject} constructor.
 *
 * @param from the file to read
 * @return the JSON object built from the normalized file content
 * @throws Exception if the file cannot be opened or read, or if building the
 *                   JSON object fails
 */
private JSONObject readJSONFile(File from) throws Exception {
    String content;
    // try-with-resources guarantees the stream is closed even when copying fails
    try (FileInputStream is = new FileInputStream(from)) {
        StringWriter writer = new StringWriter();
        IOUtils.copy(is, writer, "UTF-8");
        content = writer.toString();
    }
    content = Normalizer.normalize(content, LanguageDictionary.norme);
    return new JSONObject(content);
}

From source file:com.geecko.QuickLyric.lyrics.Genius.java

/**
 * Builds a genius.com lyrics URL from the artist and title and delegates to
 * {@code fromURL}.
 *
 * @param originalArtist artist name as provided by the caller
 * @param originalTitle  track title as provided by the caller
 * @return whatever {@code fromURL} returns for the generated URL
 */
@Reflection
public static Lyrics fromMetaData(String originalArtist, String originalTitle) {
    String urlArtist = toUrlSegment(originalArtist);
    String urlTitle = toUrlSegment(originalTitle);
    String url = String.format("http://genius.com/%s-%s-lyrics", urlArtist, urlTitle);
    return fromURL(url, originalArtist, originalTitle);
}

/**
 * Converts free text into the dash-separated form used in the lyrics URL:
 * accents are stripped, "&" becomes "and", every character other than ASCII
 * letters, digits, whitespace and '+' is dropped, and each remaining
 * whitespace or '+' character is turned into '-'.
 */
private static String toUrlSegment(String text) {
    return Normalizer.normalize(text, Normalizer.Form.NFD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "")
            // must happen before the filter below, which would otherwise delete every '&'
            .replaceAll("&", "and")
            .replaceAll("[^a-zA-Z0-9\\s+]", "")
            .trim()
            .replaceAll("[\\s+]", "-");
}

From source file:nl.utwente.bigdata.PlayersTweets.java

/**
 * Strips accents from a string.
 *
 * @param str text to process; must not be {@code null}
 * @return the NFD-decomposed text with all combining diacritical marks removed
 */
public static String deAccent(String str) {
    // NFD separates each accented letter into base letter + combining mark(s)
    final String decomposed = Normalizer.normalize(str, Normalizer.Form.NFD);
    return decomposed.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
}

From source file:com.beligum.core.utils.Toolkit.java

/**
 * Reduces a string to its ASCII characters.
 *
 * @param input text to process; must not be {@code null}
 * @return the NFD-decomposed text with every non-ASCII character removed
 */
public static String normalizeString(String input) {
    // After NFD the accents are separate (non-ASCII) code points, so removing
    // non-ASCII characters keeps the base letters.
    final String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
    final String asciiOnly = decomposed.replaceAll("[^\\p{ASCII}]", "");
    return asciiOnly;
}

From source file:net.sf.sprockets.database.sqlite.SQLite.java

/**
 * Strips diacritics from the string and returns it in upper case.
 *
 * @param s text to process
 * @return the NFD-decomposed text without combining diacritical marks,
 *         upper-cased with the {@code US} locale
 */
public static String normalise(String s) {
    // the pattern is compiled on first use and kept in sDiacritics afterwards
    if (sDiacritics == null) {
        sDiacritics = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
    }
    String decomposed = Normalizer.normalize(s, NFD);
    String withoutMarks = sDiacritics.matcher(decomposed).replaceAll("");
    return withoutMarks.toUpperCase(US);
}

From source file:org.drugis.addis.presentation.SMAASerializer.java

/**
 * Turns the input into a lower-case slug.
 * <p>
 * Matches of {@code WHITESPACE} are replaced by "-", the text is decomposed
 * (NFD), matches of {@code NONLATIN} are removed and the result is
 * lower-cased with the English locale.
 *
 * @param input text to convert
 * @return the slug derived from {@code input}
 */
public static String toSlug(String input) {
    final String dashed = WHITESPACE.matcher(input).replaceAll("-");
    final String decomposed = Normalizer.normalize(dashed, Form.NFD);
    return NONLATIN.matcher(decomposed).replaceAll("").toLowerCase(Locale.ENGLISH);
}

From source file:com.github.bfour.fpliteraturecollector.service.FileStorageService.java

/**
 * Derives a file name from the title of the given literature entry.
 * <p>
 * The title is reduced to ASCII letters and whitespace, the words
 * "a"/"A"/"the"/"The" are dropped where they stand between whitespace, the
 * result is cut to at most 68 characters and a hexadecimal timestamp is
 * appended after an underscore.
 *
 * @param lit literature entry whose title is used
 * @return the generated file name (without extension)
 */
private String getFileNameForLiterature(Literature lit) {

    // take title, removing all special characters
    String name = Normalizer.normalize(lit.getTitle(), Normalizer.Form.NFD).replaceAll("[^\\p{ASCII}]", "");

    // keep letters and whitespace only; the range must be spelled "A-Za-z"
    // because "A-z" also covers [ \ ] ^ _ ` which do not belong in a file name
    name = name.replaceAll("[^A-Za-z\\s]", "");

    // remove unnecessary words
    name = name.replaceAll("\\sa\\s", " ");
    name = name.replaceAll("\\sthe\\s", " ");
    name = name.replaceAll("\\sA\\s", " ");
    name = name.replaceAll("\\sThe\\s", " ");

    // trim
    if (name.length() > 68) {
        name = name.substring(0, 68);
    }

    // add kind-of GUID (current time in milliseconds, hex encoded)
    name += "_" + Long.toHexString(System.currentTimeMillis());

    return name;

}

From source file:com.joliciel.jochre.lexicon.DefaultLexiconWrapper.java

/**
 * Returns the string without accents and in upper case.
 *
 * @param string text to convert
 * @return the NFD-decomposed text without combining diacritical marks,
 *         upper-cased with the locale returned by {@code JochreSession.getLocale()}
 */
String toUpperCaseNoAccents(String string) {
    // decompose first so that the accents become separate combining marks,
    // strip those marks, then upper-case what is left
    return Normalizer.normalize(string, Form.NFD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "")
            .toUpperCase(JochreSession.getLocale());
}

From source file:com.joliciel.talismane.languageDetector.LanguageDetectorImpl.java

/**
 * Detects candidate languages for the given text.
 * <p>
 * The text is lower-cased and stripped of combining diacritical marks, every
 * configured feature is checked against it, the non-null feature results are
 * passed to the decision maker, and each decision is converted into a
 * locale/probability pair. The whole call is wrapped in a MONITOR task.
 *
 * @param text the text to analyse
 * @return one weighted locale per decision, in the order the decision maker returned them
 */
@Override
public List<WeightedOutcome<Locale>> detectLanguages(String text) {
    MONITOR.startTask("detectLanguages");
    try {
        if (LOG.isTraceEnabled()) {
            LOG.trace("Testing text: " + text);
        }

        // lower-case, decompose, then drop the combining marks
        String lowered = text.toLowerCase(Locale.ENGLISH);
        String cleaned = Normalizer.normalize(lowered, Form.NFD)
                .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

        // collect the results of all features that produced something
        List<FeatureResult<?>> outcomes = new ArrayList<FeatureResult<?>>();
        for (LanguageDetectorFeature<?> feature : features) {
            RuntimeEnvironment env = this.featureService.getRuntimeEnvironment();
            FeatureResult<?> outcome = feature.check(cleaned, env);
            if (outcome == null) {
                continue;
            }
            outcomes.add(outcome);
        }
        if (LOG.isTraceEnabled()) {
            for (FeatureResult<?> outcome : outcomes) {
                LOG.trace(outcome.toString());
            }
        }

        List<Decision<LanguageOutcome>> decisions = this.decisionMaker.decide(outcomes);
        if (LOG.isTraceEnabled()) {
            for (Decision<LanguageOutcome> decision : decisions) {
                LOG.trace(decision.getCode() + ": " + decision.getProbability());
            }
        }

        // translate each decision code into a Locale, keeping its probability
        List<WeightedOutcome<Locale>> weightedLocales = new ArrayList<WeightedOutcome<Locale>>();
        for (Decision<LanguageOutcome> decision : decisions) {
            String languageTag = decision.getOutcome().getCode();
            Locale locale = Locale.forLanguageTag(languageTag);
            weightedLocales.add(new WeightedOutcome<Locale>(locale, decision.getProbability()));
        }

        return weightedLocales;
    } finally {
        MONITOR.endTask();
    }
}