Example usage for java.text Normalizer normalize

List of usage examples for java.text Normalizer normalize

Introduction

In this page you can find the example usage for java.text Normalizer normalize.

Prototype

public static String normalize(CharSequence src, Form form) 

Source Link

Document

Normalize a sequence of char values.

Usage

From source file:com.linkedpipes.plugin.loader.dcatAp11ToCkanBatch.DcatAp11ToCkanBatch.java

@Override
public void execute() throws LpException {

    apiURI = configuration.getApiUri();/* ww w.jav a2  s .co  m*/

    if (apiURI == null || apiURI.isEmpty() || configuration.getApiKey() == null
            || configuration.getApiKey().isEmpty()) {
        throw exceptionFactory.failure("Missing required settings.");
    }

    Map<String, String> organizations = getOrganizations();

    LOG.debug("Querying metadata for datasets");

    LinkedList<String> datasets = new LinkedList<>();
    for (Map<String, Value> map : executeSelectQuery(
            "SELECT ?d WHERE {?d a <" + DcatAp11ToCkanBatchVocabulary.DCAT_DATASET_CLASS + ">}")) {
        datasets.add(map.get("d").stringValue());
    }

    int current = 0;
    int total = datasets.size();

    LOG.info("Found " + total + " datasets");

    progressReport.start(total);

    for (String datasetURI : datasets) {
        current++;

        CloseableHttpResponse queryResponse = null;

        LOG.info("Processing dataset " + current + "/" + total + ": " + datasetURI);

        String datasetID = executeSimpleSelectQuery("SELECT ?did WHERE {<" + datasetURI + "> <"
                + DcatAp11ToCkanBatchVocabulary.LODCZCKAN_DATASET_ID + "> ?did }", "did");
        if (datasetID.isEmpty()) {
            LOG.warn("Dataset " + datasetURI + " has missing CKAN ID");
            continue;
        }

        boolean datasetExists = false;

        Map<String, String> resUrlIdMap = new HashMap<>();
        Map<String, String> resDistroIdMap = new HashMap<>();
        Map<String, JSONObject> resourceList = new HashMap<>();

        LOG.debug("Querying for the dataset " + datasetID + " in CKAN");
        HttpGet httpGet = new HttpGet(apiURI + "/package_show?id=" + datasetID);
        try {
            queryResponse = queryClient.execute(httpGet);
            if (queryResponse.getStatusLine().getStatusCode() == 200) {
                LOG.debug("Dataset found");
                datasetExists = true;

                JSONObject response = new JSONObject(EntityUtils.toString(queryResponse.getEntity()))
                        .getJSONObject("result");
                JSONArray resourcesArray = response.getJSONArray("resources");
                for (int i = 0; i < resourcesArray.length(); i++) {
                    String id = resourcesArray.getJSONObject(i).getString("id");
                    resourceList.put(id, resourcesArray.getJSONObject(i));

                    String url = resourcesArray.getJSONObject(i).getString("url");
                    resUrlIdMap.put(url, id);

                    if (resourcesArray.getJSONObject(i).has("distro_url")) {
                        String distro = resourcesArray.getJSONObject(i).getString("distro_url");
                        resDistroIdMap.put(distro, id);
                    }
                }
            } else {
                String ent = EntityUtils.toString(queryResponse.getEntity());
                LOG.debug("Dataset not found: " + ent);
            }
        } catch (Exception e) {
            LOG.error(e.getLocalizedMessage(), e);
        } finally {
            if (queryResponse != null) {
                try {
                    queryResponse.close();
                } catch (IOException e) {
                    LOG.error(e.getLocalizedMessage(), e);
                }
            }
        }

        LinkedList<String> keywords = new LinkedList<>();
        for (Map<String, Value> map : executeSelectQuery(
                "SELECT ?keyword WHERE {<" + datasetURI + "> <" + DcatAp11ToCkanBatchVocabulary.DCAT_KEYWORD
                        + "> ?keyword FILTER(LANGMATCHES(LANG(?keyword), \"" + configuration.getLoadLanguage()
                        + "\"))}")) {
            keywords.add(map.get("keyword").stringValue());
        }

        String publisher_uri = executeSimpleSelectQuery("SELECT ?publisher_uri WHERE {<" + datasetURI + "> <"
                + DCTERMS.PUBLISHER + "> ?publisher_uri }", "publisher_uri");
        String publisher_name = executeSimpleSelectQuery(
                "SELECT ?publisher_name WHERE {<" + datasetURI + "> <" + DCTERMS.PUBLISHER + ">/<" + FOAF.NAME
                        + "> ?publisher_name FILTER(LANGMATCHES(LANG(?publisher_name), \""
                        + configuration.getLoadLanguage() + "\"))}",
                "publisher_name");

        if (!organizations.containsKey(publisher_uri)) {
            LOG.debug("Creating organization " + publisher_uri);
            JSONObject root = new JSONObject();

            if (publisher_name == null || publisher_name.isEmpty()) {
                throw exceptionFactory.failure("Organization has no name: " + publisher_uri);
            }

            root.put("title", publisher_name);
            String orgname = Normalizer.normalize(publisher_name, Normalizer.Form.NFD)
                    .replaceAll("\\P{InBasic_Latin}", "").replace(' ', '-').replace('.', '-').toLowerCase();
            root.put("name", orgname);
            JSONArray org_extras = new JSONArray();
            org_extras.put(new JSONObject().put("key", "uri").put("value", publisher_uri));
            root.put("extras", org_extras);

            HttpPost httpPost = new HttpPost(apiURI + "/organization_create");
            httpPost.addHeader(new BasicHeader("Authorization", configuration.getApiKey()));

            String json = root.toString();

            httpPost.setEntity(new StringEntity(json, Charset.forName("utf-8")));

            CloseableHttpResponse response = null;

            try {
                response = postClient.execute(httpPost);
                if (response.getStatusLine().getStatusCode() == 200) {
                    LOG.debug("Organization created OK");
                    //LOG.info("Response: " + EntityUtils.toString(response.getEntity()));
                    organizations.put(publisher_uri, orgname);
                } else if (response.getStatusLine().getStatusCode() == 409) {
                    String ent = EntityUtils.toString(response.getEntity());
                    LOG.error("Organization conflict: " + ent);
                    throw exceptionFactory.failure("Organization conflict: " + ent);
                } else {
                    String ent = EntityUtils.toString(response.getEntity());
                    LOG.error("Response:" + ent);
                    throw exceptionFactory.failure("Error creating organization: " + ent);
                }
            } catch (Exception e) {
                LOG.error(e.getLocalizedMessage(), e);
            } finally {
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        LOG.error(e.getLocalizedMessage(), e);
                        throw exceptionFactory.failure("Error creating dataset");
                    }
                }
            }
        }

        LOG.debug("Creating JSON");

        JSONObject root = new JSONObject();

        JSONArray tags = new JSONArray();
        for (String keyword : keywords) {
            String safekeyword = fixKeyword(keyword);
            if (safekeyword.length() >= 2) {
                tags.put(new JSONObject().put("name", safekeyword));
            }
        }
        root.put("tags", tags);

        JSONArray resources = new JSONArray();

        if (!datasetID.isEmpty()) {
            root.put("name", datasetID);
        }

        String title = executeSimpleSelectQuery("SELECT ?title WHERE {<" + datasetURI + "> <" + DCTERMS.TITLE
                + "> ?title FILTER(LANGMATCHES(LANG(?title), \"" + configuration.getLoadLanguage() + "\"))}",
                "title");
        if (!title.isEmpty()) {
            root.put("title", title);
        }
        String description = executeSimpleSelectQuery("SELECT ?description WHERE {<" + datasetURI + "> <"
                + DCTERMS.DESCRIPTION + "> ?description FILTER(LANGMATCHES(LANG(?description), \""
                + configuration.getLoadLanguage() + "\"))}", "description");
        if (!description.isEmpty()) {
            root.put("notes", description);
        }
        String contactPoint = executeSimpleSelectQuery("SELECT ?contact WHERE {<" + datasetURI + "> <"
                + DcatAp11ToCkanBatchVocabulary.DCAT_CONTACT_POINT + ">/<"
                + DcatAp11ToCkanBatchVocabulary.VCARD_HAS_EMAIL + "> ?contact }", "contact");
        if (!contactPoint.isEmpty()) {
            root.put("maintainer_email", contactPoint);
        }
        String curatorName = executeSimpleSelectQuery(
                "SELECT ?name WHERE {<" + datasetURI + "> <" + DcatAp11ToCkanBatchVocabulary.DCAT_CONTACT_POINT
                        + ">/<" + DcatAp11ToCkanBatchVocabulary.VCARD_FN + "> ?name }",
                "name");
        if (!curatorName.isEmpty()) {
            root.put("maintainer", curatorName);
        }
        String issued = executeSimpleSelectQuery(
                "SELECT ?issued WHERE {<" + datasetURI + "> <" + DCTERMS.ISSUED + "> ?issued }", "issued");
        if (!issued.isEmpty()) {
            root.put("metadata_created", issued);
        }
        String modified = executeSimpleSelectQuery(
                "SELECT ?modified WHERE {<" + datasetURI + "> <" + DCTERMS.MODIFIED + "> ?modified }",
                "modified");
        if (!modified.isEmpty()) {
            root.put("metadata_modified", modified);
        }

        if (configuration.getProfile().equals(DcatAp11ToCkanBatchVocabulary.PROFILES_NKOD.stringValue())) {
            if (!publisher_uri.isEmpty()) {
                root.put("publisher_uri", publisher_uri);
            }
            if (!publisher_name.isEmpty()) {
                root.put("publisher_name", publisher_name);
            }

            String periodicity = executeSimpleSelectQuery("SELECT ?periodicity WHERE {<" + datasetURI + "> <"
                    + DCTERMS.ACCRUAL_PERIODICITY + "> ?periodicity }", "periodicity");
            if (!periodicity.isEmpty()) {
                root.put("frequency", periodicity);
            }
            String temporalStart = executeSimpleSelectQuery(
                    "SELECT ?temporalStart WHERE {<" + datasetURI + "> <" + DCTERMS.TEMPORAL + ">/<"
                            + DcatAp11ToCkanBatchVocabulary.SCHEMA_STARTDATE + "> ?temporalStart }",
                    "temporalStart");
            if (!temporalStart.isEmpty()) {
                root.put("temporal_start", temporalStart);
            }
            String temporalEnd = executeSimpleSelectQuery(
                    "SELECT ?temporalEnd WHERE {<" + datasetURI + "> <" + DCTERMS.TEMPORAL + ">/<"
                            + DcatAp11ToCkanBatchVocabulary.SCHEMA_ENDDATE + "> ?temporalEnd }",
                    "temporalEnd");
            if (!temporalEnd.isEmpty()) {
                root.put("temporal_end", temporalEnd);
            }
            String schemaURL = executeSimpleSelectQuery(
                    "SELECT ?schema WHERE {<" + datasetURI + "> <" + FOAF.PAGE + "> ?schema }", "schema");
            if (!schemaURL.isEmpty()) {
                root.put("schema", schemaURL);
            }
            String spatial = executeSimpleSelectQuery(
                    "SELECT ?spatial WHERE {<" + datasetURI + "> <" + DCTERMS.SPATIAL + "> ?spatial }",
                    "spatial");
            if (!spatial.isEmpty()) {
                root.put("spatial_uri", spatial);
            }
            LinkedList<String> themes = new LinkedList<>();
            for (Map<String, Value> map : executeSelectQuery("SELECT ?theme WHERE {<" + datasetURI + "> <"
                    + DcatAp11ToCkanBatchVocabulary.DCAT_THEME + "> ?theme }")) {
                themes.add(map.get("theme").stringValue());
            }
            String concatThemes = "";
            for (String theme : themes) {
                concatThemes += theme + " ";
            }
            if (!concatThemes.isEmpty())
                root.put("theme", concatThemes);

        }

        //Distributions

        LinkedList<String> distributions = new LinkedList<>();
        for (Map<String, Value> map : executeSelectQuery("SELECT ?distribution WHERE {<" + datasetURI + "> <"
                + DcatAp11ToCkanBatchVocabulary.DCAT_DISTRIBUTION + "> ?distribution }")) {
            distributions.add(map.get("distribution").stringValue());
        }

        for (String distribution : distributions) {
            JSONObject distro = new JSONObject();

            String dtitle = executeSimpleSelectQuery("SELECT ?title WHERE {<" + distribution + "> <"
                    + DCTERMS.TITLE + "> ?title FILTER(LANGMATCHES(LANG(?title), \""
                    + configuration.getLoadLanguage() + "\"))}", "title");
            if (!dtitle.isEmpty()) {
                distro.put("name", dtitle);
            }
            String ddescription = executeSimpleSelectQuery("SELECT ?description WHERE {<" + distribution + "> <"
                    + DCTERMS.DESCRIPTION + "> ?description FILTER(LANGMATCHES(LANG(?description), \""
                    + configuration.getLoadLanguage() + "\"))}", "description");
            if (!ddescription.isEmpty()) {
                distro.put("description", ddescription);
            }
            //DCAT-AP v1.1: has to be am IRI from http://publications.europa.eu/mdr/authority/file-type/index.html
            String dformat = executeSimpleSelectQuery(
                    "SELECT ?format WHERE {<" + distribution + "> <" + DCTERMS.FORMAT + "> ?format }",
                    "format");
            if (!dformat.isEmpty() && codelists != null) {
                String formatlabel = executeSimpleCodelistSelectQuery(
                        "SELECT ?formatlabel WHERE {<" + dformat + "> <" + SKOS.PREF_LABEL
                                + "> ?formatlabel FILTER(LANGMATCHES(LANG(?formatlabel), \"en\"))}",
                        "formatlabel");
                if (!formatlabel.isEmpty()) {
                    distro.put("format", formatlabel);
                }
            }

            String dwnld = executeSimpleSelectQuery("SELECT ?dwnld WHERE {<" + distribution + "> <"
                    + DcatAp11ToCkanBatchVocabulary.DCAT_DOWNLOADURL + "> ?dwnld }", "dwnld");
            String access = executeSimpleSelectQuery("SELECT ?acc WHERE {<" + distribution + "> <"
                    + DcatAp11ToCkanBatchVocabulary.DCAT_ACCESSURL + "> ?acc }", "acc");

            //we prefer downloadURL, but only accessURL is mandatory
            if (dwnld == null || dwnld.isEmpty()) {
                dwnld = access;
                if (dwnld == null || dwnld.isEmpty()) {
                    LOG.warn("Empty download and access URLs: " + datasetURI);
                    continue;
                }
            }

            if (!dwnld.isEmpty()) {
                distro.put("url", dwnld);
            }
            if (!distribution.isEmpty()) {
                distro.put("distro_url", distribution);
            }

            distro.put("resource_type", "file");

            if (resDistroIdMap.containsKey(distribution)) {
                String id = resDistroIdMap.get(distribution);
                distro.put("id", id);
                resourceList.remove(id);
            } else if (resUrlIdMap.containsKey(dwnld)) {
                String id = resUrlIdMap.get(dwnld);
                distro.put("id", id);
                resourceList.remove(id);
            }

            String dissued = executeSimpleSelectQuery(
                    "SELECT ?issued WHERE {<" + distribution + "> <" + DCTERMS.ISSUED + "> ?issued }",
                    "issued");
            if (!dissued.isEmpty()) {
                distro.put("created", dissued);
            }
            String dmodified = executeSimpleSelectQuery(
                    "SELECT ?modified WHERE {<" + distribution + "> <" + DCTERMS.MODIFIED + "> ?modified }",
                    "modified");
            if (!dmodified.isEmpty()) {
                distro.put("last_modified", dmodified);
            }

            if (configuration.getProfile().equals(DcatAp11ToCkanBatchVocabulary.PROFILES_NKOD.stringValue())) {
                String dtemporalStart = executeSimpleSelectQuery(
                        "SELECT ?temporalStart WHERE {<" + distribution + "> <" + DCTERMS.TEMPORAL + ">/<"
                                + DcatAp11ToCkanBatchVocabulary.SCHEMA_STARTDATE + "> ?temporalStart }",
                        "temporalStart");
                if (!dtemporalStart.isEmpty()) {
                    distro.put("temporal_start", dtemporalStart);
                }
                String dtemporalEnd = executeSimpleSelectQuery(
                        "SELECT ?temporalEnd WHERE {<" + distribution + "> <" + DCTERMS.TEMPORAL + ">/<"
                                + DcatAp11ToCkanBatchVocabulary.SCHEMA_ENDDATE + "> ?temporalEnd }",
                        "temporalEnd");
                if (!dtemporalEnd.isEmpty()) {
                    distro.put("temporal_end", dtemporalEnd);
                }
                String dspatial = executeSimpleSelectQuery(
                        "SELECT ?spatial WHERE {<" + distribution + "> <" + DCTERMS.SPATIAL + "> ?spatial }",
                        "spatial");
                if (!dspatial.isEmpty()) {
                    root.put("spatial_uri", dspatial);
                }
                String dschemaURL = executeSimpleSelectQuery(
                        "SELECT ?schema WHERE {<" + distribution + "> <" + DCTERMS.CONFORMS_TO + "> ?schema }",
                        "schema");
                if (!dschemaURL.isEmpty()) {
                    distro.put("describedBy", dschemaURL);
                }
                String dlicense = executeSimpleSelectQuery(
                        "SELECT ?license WHERE {<" + distribution + "> <" + DCTERMS.LICENSE + "> ?license }",
                        "license");
                if (!dlicense.isEmpty()) {
                    distro.put("license_link", dlicense);
                }
                String dmimetype = executeSimpleSelectQuery("SELECT ?format WHERE {<" + distribution + "> <"
                        + DcatAp11ToCkanBatchVocabulary.DCAT_MEDIATYPE + "> ?format }", "format");
                if (!dmimetype.isEmpty()) {
                    distro.put("mimetype", dmimetype.replaceAll(".*\\/([^\\/]+\\/[^\\/]+)", "$1"));
                }
            }

            resources.put(distro);
        }

        //Add the remaining distributions that were not updated but existed in the original dataset
        for (Entry<String, JSONObject> resource : resourceList.entrySet()) {
            resources.put(resource.getValue());
        }

        root.put("resources", resources);

        //Create new dataset
        if (!datasetExists) {
            JSONObject createRoot = new JSONObject();
            CloseableHttpResponse response = null;

            createRoot.put("name", datasetID);
            createRoot.put("title", title);
            createRoot.put("owner_org", organizations.get(publisher_uri));

            LOG.debug("Creating dataset in CKAN");
            HttpPost httpPost = new HttpPost(apiURI + "/package_create?id=" + datasetID);
            httpPost.addHeader(new BasicHeader("Authorization", configuration.getApiKey()));

            String json = createRoot.toString();

            LOG.debug("Creating dataset with: " + json);

            httpPost.setEntity(new StringEntity(json, Charset.forName("utf-8")));

            try {
                response = createClient.execute(httpPost);
                if (response.getStatusLine().getStatusCode() == 200) {
                    LOG.debug("Dataset created OK");
                    //LOG.info("Response: " + EntityUtils.toString(response.getEntity()));
                } else if (response.getStatusLine().getStatusCode() == 409) {
                    String ent = EntityUtils.toString(response.getEntity());
                    LOG.error("Dataset already exists: " + ent);
                    throw exceptionFactory.failure("Dataset already exists");
                } else {
                    String ent = EntityUtils.toString(response.getEntity());
                    LOG.error("Response:" + ent);
                    throw exceptionFactory.failure("Error creating dataset");
                }
            } catch (Exception e) {
                LOG.error(e.getLocalizedMessage(), e);
            } finally {
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        LOG.error(e.getLocalizedMessage(), e);
                        throw exceptionFactory.failure("Error creating dataset");
                    }
                }
            }
        }

        //Update existing dataset
        String json = root.toString();
        LOG.debug("Posting to CKAN");
        HttpPost httpPost = new HttpPost(apiURI + "/package_update?id=" + datasetID);
        httpPost.addHeader(new BasicHeader("Authorization", configuration.getApiKey()));

        LOG.debug(json);

        httpPost.setEntity(new StringEntity(json, Charset.forName("utf-8")));
        CloseableHttpResponse response = null;

        try {
            response = postClient.execute(httpPost);
            if (response.getStatusLine().getStatusCode() == 200) {
                //LOG.info("Response:" + EntityUtils.toString(response.getEntity()));
            } else {
                String ent = EntityUtils.toString(response.getEntity());
                LOG.error("Response:" + ent);
                throw exceptionFactory.failure("Error updating dataset");
            }
        } catch (Exception e) {
            LOG.error(e.getLocalizedMessage(), e);
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    LOG.error(e.getLocalizedMessage(), e);
                    throw exceptionFactory.failure("Error updating dataset");
                }
            }
        }

        progressReport.entryProcessed();
    }

    try {
        queryClient.close();
        createClient.close();
        postClient.close();
    } catch (IOException e) {
        LOG.error(e.getLocalizedMessage(), e);
    }

    progressReport.done();

}

From source file:io.github.swagger2markup.markup.builder.internal.AbstractMarkupDocBuilder.java

protected String normalizeAnchor(Markup spaceEscape, String anchor) {
    String normalizedAnchor = defaultString(anchorPrefix) + anchor.trim();
    normalizedAnchor = Normalizer.normalize(normalizedAnchor, Normalizer.Form.NFD)
            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
    normalizedAnchor = ANCHOR_IGNORABLE_PATTERN.matcher(normalizedAnchor).replaceAll(spaceEscape.toString());
    normalizedAnchor = normalizedAnchor/*from  w w  w  .  j  a  v a2s. co m*/
            .replaceAll(String.format("([%1$s])([%1$s]+)", ANCHOR_SEPARATION_CHARACTERS), "$1");
    normalizedAnchor = StringUtils.strip(normalizedAnchor, ANCHOR_SEPARATION_CHARACTERS);
    normalizedAnchor = normalizedAnchor.trim().toLowerCase();

    String validAnchor = ANCHOR_UNIGNORABLE_PATTERN.matcher(normalizedAnchor).replaceAll("");
    if (validAnchor.length() != normalizedAnchor.length())
        normalizedAnchor = DigestUtils.md5Hex(normalizedAnchor);
    else
        normalizedAnchor = validAnchor;

    return normalizedAnchor;
}

From source file:com.ved.musicmapapp.LoginAcitivity.java

public String unAccent(String s) {
    String temp = Normalizer.normalize(s, Normalizer.Form.NFD);
    Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
    return pattern.matcher(temp).replaceAll("").replaceAll("?", "D").replace("", "");
}

From source file:com.joliciel.talismane.tokeniser.filters.TokenRegexFilterImpl.java

Pattern getPattern() {
    if (pattern == null) {
        // we may need to replace WordLists by the list contents
        String myRegex = this.regex;

        if (LOG.isTraceEnabled()) {
            LOG.trace("Regex: " + myRegex);
        }//www  . j av a  2 s .c  o m

        if (this.autoWordBoundaries) {
            Boolean startsWithLetter = null;
            for (int i = 0; i < myRegex.length() && startsWithLetter == null; i++) {
                char c = myRegex.charAt(i);
                if (c == '\\') {
                    i++;
                    c = myRegex.charAt(i);
                    if (c == 'd' || c == 'w') {
                        startsWithLetter = true;
                    } else if (c == 's' || c == 'W' || c == 'b' || c == 'B') {
                        startsWithLetter = false;
                    } else if (c == 'p') {
                        i += 2; // skip the open curly brackets
                        int closeCurlyBrackets = myRegex.indexOf('}', i);
                        int openParentheses = myRegex.indexOf('(', i);
                        int endIndex = closeCurlyBrackets;
                        if (openParentheses > 0 && openParentheses < closeCurlyBrackets)
                            endIndex = openParentheses;
                        if (endIndex > 0) {
                            String specialClass = myRegex.substring(i, endIndex);
                            if (specialClass.equals("WordList")) {
                                startsWithLetter = true;
                            }
                        }
                    }
                    break;
                } else if (c == '[' || c == '(') {
                    // do nothing
                } else if (Character.isLetter(c) || Character.isDigit(c)) {
                    startsWithLetter = true;
                } else {
                    startsWithLetter = false;
                }
            }

            Boolean endsWithLetter = null;
            for (int i = myRegex.length() - 1; i >= 0 && endsWithLetter == null; i--) {
                char c = myRegex.charAt(i);
                char prevC = ' ';
                if (i >= 1)
                    prevC = myRegex.charAt(i - 1);
                if (prevC == '\\') {
                    if (c == 'd' || c == 'w') {
                        endsWithLetter = true;
                    } else if (c == 's' || c == 'W' || c == 'b' || c == 'B') {
                        endsWithLetter = false;
                    } else if (c == 'p') {
                        i += 2; // skip the open curly brackets
                        int closeCurlyBrackets = myRegex.indexOf('}', i);
                        int openParentheses = myRegex.indexOf('(', i);
                        int endIndex = closeCurlyBrackets;
                        if (openParentheses < closeCurlyBrackets)
                            endIndex = openParentheses;
                        if (endIndex > 0) {
                            String specialClass = myRegex.substring(i, endIndex);
                            if (specialClass.equals("WordList") || specialClass.equals("Alpha")
                                    || specialClass.equals("Lower") || specialClass.equals("Upper")
                                    || specialClass.equals("ASCII") || specialClass.equals("Digit")) {
                                startsWithLetter = true;
                            }
                        }
                    }
                    break;
                } else if (c == ']' || c == ')' || c == '+') {
                    // do nothing
                } else if (c == '}') {
                    int startIndex = myRegex.lastIndexOf('{') + 1;
                    int closeCurlyBrackets = myRegex.indexOf('}', startIndex);
                    int openParentheses = myRegex.indexOf('(', startIndex);
                    int endIndex = closeCurlyBrackets;
                    if (openParentheses > 0 && openParentheses < closeCurlyBrackets)
                        endIndex = openParentheses;
                    if (endIndex > 0) {
                        String specialClass = myRegex.substring(startIndex, endIndex);
                        if (specialClass.equals("WordList") || specialClass.equals("Alpha")
                                || specialClass.equals("Lower") || specialClass.equals("Upper")
                                || specialClass.equals("ASCII") || specialClass.equals("Digit")) {
                            endsWithLetter = true;
                        }
                    }
                    break;
                } else if (Character.isLetter(c) || Character.isDigit(c)) {
                    endsWithLetter = true;
                } else {
                    endsWithLetter = false;
                }
            }

            if (startsWithLetter != null && startsWithLetter) {
                myRegex = "\\b" + myRegex;
            }
            if (endsWithLetter != null && endsWithLetter) {
                myRegex = myRegex + "\\b";
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("After autoWordBoundaries: " + myRegex);
            }
        }

        if (!this.caseSensitive || !this.diacriticSensitive) {
            StringBuilder regexBuilder = new StringBuilder();
            for (int i = 0; i < myRegex.length(); i++) {
                char c = myRegex.charAt(i);
                if (c == '\\') {
                    // escape - skip next
                    regexBuilder.append(c);
                    i++;
                    c = myRegex.charAt(i);
                    regexBuilder.append(c);
                } else if (c == '[') {
                    // character group, don't change it
                    regexBuilder.append(c);
                    while (c != ']' && i < myRegex.length()) {
                        i++;
                        c = myRegex.charAt(i);
                        regexBuilder.append(c);
                    }
                } else if (c == '{') {
                    // command, don't change it
                    regexBuilder.append(c);
                    while (c != '}' && i < myRegex.length()) {
                        i++;
                        c = myRegex.charAt(i);
                        regexBuilder.append(c);
                    }
                } else if (Character.isLetter(c)) {
                    Set<String> chars = new TreeSet<String>();
                    chars.add("" + c);
                    char noAccent = diacriticPattern.matcher(Normalizer.normalize("" + c, Form.NFD))
                            .replaceAll("").charAt(0);

                    if (!this.caseSensitive) {
                        chars.add("" + Character.toUpperCase(c));
                        chars.add("" + Character.toLowerCase(c));
                        chars.add("" + Character.toUpperCase(noAccent));
                    }
                    if (!this.diacriticSensitive) {
                        chars.add("" + noAccent);
                        if (!this.caseSensitive) {
                            chars.add("" + Character.toLowerCase(noAccent));
                        }
                    }
                    if (chars.size() == 1) {
                        regexBuilder.append(c);
                    } else {
                        regexBuilder.append('[');
                        for (String oneChar : chars) {
                            regexBuilder.append(oneChar);
                        }
                        regexBuilder.append(']');
                    }
                } else {
                    regexBuilder.append(c);
                }
            }
            myRegex = regexBuilder.toString();
            if (LOG.isTraceEnabled()) {
                LOG.trace("After caseSensitive: " + myRegex);
            }
        }

        Matcher matcher = wordListPattern.matcher(myRegex);
        StringBuilder regexBuilder = new StringBuilder();

        int lastIndex = 0;
        while (matcher.find()) {
            String[] params = matcher.group(1).split(",");
            int start = matcher.start();
            int end = matcher.end();
            regexBuilder.append(myRegex.substring(lastIndex, start));

            String wordListName = params[0];
            boolean uppercaseOptional = false;
            boolean diacriticsOptional = false;
            boolean lowercaseOptional = false;
            boolean firstParam = true;
            for (String param : params) {
                if (firstParam) {
                    /* word list name */ } else if (param.equals("diacriticsOptional"))
                    diacriticsOptional = true;
                else if (param.equals("uppercaseOptional"))
                    uppercaseOptional = true;
                else if (param.equals("lowercaseOptional"))
                    lowercaseOptional = true;
                else
                    throw new TalismaneException(
                            "Unknown parameter in word list " + matcher.group(1) + ": " + param);
                firstParam = false;
            }

            ExternalWordList wordList = externalResourceFinder.getExternalWordList(wordListName);
            if (wordList == null)
                throw new TalismaneException("Unknown word list: " + wordListName);

            StringBuilder sb = new StringBuilder();

            boolean firstWord = true;
            for (String word : wordList.getWordList()) {
                if (!firstWord)
                    sb.append("|");
                word = Normalizer.normalize(word, Form.NFC);
                if (uppercaseOptional || diacriticsOptional) {
                    String wordNoDiacritics = Normalizer.normalize(word, Form.NFD)
                            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
                    String wordLowercase = word.toLowerCase(Locale.ENGLISH);
                    String wordLowercaseNoDiacritics = Normalizer.normalize(wordLowercase, Form.NFD)
                            .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
                    String wordUppercase = wordNoDiacritics.toUpperCase(Locale.ENGLISH);

                    boolean needsGrouping = false;
                    if (uppercaseOptional && !word.equals(wordLowercase))
                        needsGrouping = true;
                    if (diacriticsOptional && !word.equals(wordNoDiacritics))
                        needsGrouping = true;
                    if (lowercaseOptional && !word.equals(wordUppercase))
                        needsGrouping = true;
                    if (needsGrouping) {
                        for (int i = 0; i < word.length(); i++) {
                            char c = word.charAt(i);

                            boolean grouped = false;
                            if (uppercaseOptional && c != wordLowercase.charAt(i))
                                grouped = true;
                            if (diacriticsOptional && c != wordNoDiacritics.charAt(i))
                                grouped = true;
                            if (lowercaseOptional && c != wordUppercase.charAt(i))
                                grouped = true;

                            if (!grouped)
                                sb.append(c);
                            else {
                                sb.append("[");
                                String group = "" + c;
                                if (uppercaseOptional && group.indexOf(wordLowercase.charAt(i)) < 0)
                                    group += (wordLowercase.charAt(i));
                                if (lowercaseOptional && group.indexOf(wordUppercase.charAt(i)) < 0)
                                    group += (wordUppercase.charAt(i));
                                if (diacriticsOptional && group.indexOf(wordNoDiacritics.charAt(i)) < 0)
                                    group += (wordNoDiacritics.charAt(i));
                                if (uppercaseOptional && diacriticsOptional
                                        && group.indexOf(wordLowercaseNoDiacritics.charAt(i)) < 0)
                                    group += (wordLowercaseNoDiacritics.charAt(i));

                                sb.append(group);
                                sb.append("]");
                            } // does this letter need grouping?
                        } // next letter
                    } else {
                        sb.append(word);
                    } // any options activated?
                } else {
                    sb.append(word);
                }
                firstWord = false;
            } // next word in list

            regexBuilder.append(sb.toString());
            lastIndex = end;
        } // next match
        regexBuilder.append(myRegex.substring(lastIndex));
        myRegex = regexBuilder.toString();
        this.pattern = Pattern.compile(myRegex, Pattern.UNICODE_CHARACTER_CLASS);
    }
    return pattern;
}

From source file:org.rascalmpl.library.cobra.RandomValueTypeVisitor.java

@Override
public IValue visitString(Type type) {
    if (maxDepth <= 0 || (stRandom.nextInt(2) == 0)) {
        return vf.string("");
    } else {//  w  ww  . j ava 2  s .  c o m
        RandomValueTypeVisitor visitor = descend();
        IString str = vf.string(visitor.generate(type).toString());
        IString result = str.concat(vf.string(RandomStringUtils.random(1)));
        // make sure we are not generating very strange sequences
        String normalized = Normalizer.normalize(result.getValue(), Form.NFC);
        return vf.string(normalized);
    }
}

From source file:org.openremote.foxycart.resources.FoxyCartResource.java

private String removeDiacritics(String text) {
    // characters     

    text = text.replace("\u00E4", "ae");
    text = text.replace("\u00F1", "ny");
    text = text.replace("\u00F6", "oe");
    text = text.replace("\u00FC", "ue");
    text = text.replace("\u00FF", "yu");

    text = Normalizer.normalize(text, Normalizer.Form.NFD);
    text = text.replaceAll("\\p{M}", "");

    text = text.replace("\u00DF", "ss");
    text = text.replace("\u00C6", "AE");
    text = text.replace("\u00E6", "ae");
    text = text.replace("\u0132", "IJ");
    text = text.replace("\u0133", "ij");
    text = text.replace("\u0152", "Oe");
    text = text.replace("\u0153", "oe");

    // ??/*from  w  w w . ja  v  a 2 s . co m*/

    text = text.replace("\u00D0", "D");
    text = text.replace("\u0110", "D");
    text = text.replace("\u00F0", "d");
    text = text.replace("\u0111", "d");
    text = text.replace("\u0126", "H");
    text = text.replace("\u0127", "h");

    //  ?

    text = text.replace("\u0131", "i");
    text = text.replace("\u0138", "k");
    text = text.replace("\u013F", "L");
    text = text.replace("\u0141", "L");
    text = text.replace("\u0140", "l");
    text = text.replace("\u0142", "l");

    // 

    text = text.replace("\u014A", "N");
    text = text.replace("\u0149", "n");
    text = text.replace("\u014B", "n");
    text = text.replace("\u00D8", "O");
    text = text.replace("\u00F8", "o");
    text = text.replace("\u017F", "s");

    // 

    text = text.replace("\u00DE", "T");
    text = text.replace("\u0166", "T");
    text = text.replace("\u00FE", "t");
    text = text.replace("\u0167", "t");

    return text;
}

From source file:org.dspace.installer_edm.InstallerEDMBase.java

/**
 * quitar acentos a una cadena// w ww  .j a  v  a 2  s  . co m
 *
 * @param text cadena original
 * @return cadena sin acentos
 */
public String removeAccents(String text) {
    if (text != null)
        text = text.replaceAll(" +", "_");
    return text == null ? null
            : Normalizer.normalize(text, Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
}

From source file:org.structr.core.graph.search.Search.java

/**
 * Normalize special characters to ASCII
 *
 * @param input//from  w  w  w  .j a va2  s.c o m
 * @return
 */
public static String normalize(final String input) {

    String normalized = Normalizer.normalize(input, Normalizer.Form.NFD);

    return normalized.replaceAll("[^\\p{ASCII}]", "");

}

From source file:de.pangaea.fixo3.xml.ProcessXmlFiles.java

private String toAscii(String s) {
    s = Normalizer.normalize(s, Normalizer.Form.NFD);
    return s.replaceAll("[^\\x00-\\x7F]", "");
}

From source file:com.cloudbees.hudson.plugins.folder.ChildNameGeneratorTest.java

private void checkComputedFolder(ComputedFolderImpl instance, int round, Normalizer.Form form)
        throws IOException {
    assertThat("We detected the filesystem normalization form", form, notNullValue());
    instance.assertItemNames(round, "$$child-one", "$$child_two", "$$child three", "$$leanbh cu\u0301ig",
            "$$ ?", "$$", "$$\u110b\u1161\u110b\u1175 7", "$$nin\u0303o ocho");
    instance.assertItemShortUrls(round, "job/$$child-one/", "job/$$child_two/", "job/$$child%20three/",
            "job/$$leanbh%20cu%CC%81ig/",
            "job/$$%D1%80%D0%B5%D0%B1%D0%B5%D0%BD%D0%BE%D0%BA%20%D0%BF%D1%8F%D1%82%D1%8C/", //  ?
            "job/$$%E5%84%BF%E7%AB%A5%E5%85%AD/", // 
            "job/$$%E1%84%8B%E1%85%A1%E1%84%8B%E1%85%B5%207/", // ? 7
            "job/$$nin%CC%83o%20ocho/");
    switch (form) {
    case NFC:/*from w w  w .j a va2 s  .c o  m*/
    case NFKC:
        instance.assertItemDirs(round, "child_on-1ec93354e47959489d1440d", "child_tw-bca7d461e11f4f3ed12fd0d",
                "child_th-b7a6e5662f26eb036090308", "leanbh_c-cde398abd1bc432e87c49ca",
                "________-97e4b38574769f9d9968fe9", //  ?
                "___-d22e9fe51690274d8262bda", // 
                "_____7-d57fff123224bd679e4213b", // ? 7
                "nin_o_oc-1a0c91070942136ba398919");
        break;
    case NFD:
    case NFKD:
        instance.assertItemDirs(round, "child_on-1ec93354e47959489d1440d", "child_tw-bca7d461e11f4f3ed12fd0d",
                "child_th-b7a6e5662f26eb036090308", "leanbh_c-66fe5ac0be4a896280ef09f",
                "________-97e4b38574769f9d9968fe9", //  ?
                "___-d22e9fe51690274d8262bda", // 
                "_____7-6d2219439eec0df19863ab8", // ? 7
                "nin_o_oc-782e3bad2d233732a03f9dd");
        break;
    }
    for (String name : Arrays.asList("child-one", "child_two", "child three", "leanbh cig",
            " ?", "", "? 7", "nio ocho")) {
        checkChild(instance, Normalizer.normalize(name, form));
    }
}