Example usage for java.lang Character isLowerCase

List of usage examples for java.lang Character isLowerCase

Introduction

On this page you can find example usages of java.lang.Character.isLowerCase.

Prototype

public static boolean isLowerCase(int codePoint) 

Source Link

Document

Determines if the specified character (Unicode code point) is a lowercase character.

Usage

From source file:de.tudarmstadt.ukp.dkpro.spelling.experiments.errormining.SpellingErrorFilter.java

/**
 * Tells whether two characters agree in letter case.
 * <p>
 * Only a strict upper/lower mismatch counts as a disagreement; characters without case
 * (digits, punctuation, ...) never cause a mismatch.
 *
 * @param char1 the first character to compare
 * @param char2 the second character to compare
 * @return {@code false} if one character is upper case while the other is lower case,
 *         {@code true} in every other situation
 */
private boolean haveFirstLettersSameCase(char char1, char char2) {
    boolean upperThenLower = Character.isUpperCase(char1) && Character.isLowerCase(char2);
    boolean lowerThenUpper = Character.isLowerCase(char1) && Character.isUpperCase(char2);
    return !(upperThenLower || lowerThenUpper);
}

From source file:com.sfs.whichdoctor.dao.AddressVerificationDAOImpl.java

/**
 * Checks whether the string contains no lower case letters.
 * <p>
 * The string is scanned by Unicode code point rather than by {@code char}, so lower case
 * letters outside the Basic Multilingual Plane (stored as surrogate pairs) are detected as
 * well. Characters that are not letters (digits, punctuation, whitespace) are ignored, and
 * an empty string therefore counts as all upper case.
 *
 * @param s the string to inspect; must not be {@code null}
 * @return {@code true} if no letter in {@code s} is lower case
 */
private static boolean isAllUpper(String s) {
    for (int i = 0; i < s.length();) {
        int codePoint = s.codePointAt(i);
        if (Character.isLetter(codePoint) && Character.isLowerCase(codePoint)) {
            return false;
        }
        // Advance by one or two chars depending on whether this was a surrogate pair.
        i += Character.charCount(codePoint);
    }
    return true;
}

From source file:org.languagetool.rules.de.CaseRule.java

/**
 * Walks over the non-whitespace tokens of a sentence and collects upper/lower case rule matches.
 * <p>
 * For each token (the sentence-start token and the first word are skipped) the method first
 * considers a possible "should be lowercase" match for base-form tokens that are untagged or have
 * a verb reading, and then a possible "should be uppercase" match. The actual decision whether a
 * match is recorded is delegated to {@code potentiallyAddLowercaseMatch} and
 * {@code potentiallyAddUppercaseMatch}, which are not visible here.
 *
 * @param sentence the analyzed sentence to check
 * @return the collected matches, converted via {@code toRuleMatchArray}
 * @throws IOException declared by the signature; NOTE(review): presumably raised by the tagger lookup - confirm
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();

    // State carried from one token to the next:
    // - prevTokenIsDas: the previous token is contained in nounIndicators
    // - isPrecededByModalOrAuxiliary: some earlier token matched VER:(MOD|AUX); never reset within the sentence
    boolean prevTokenIsDas = false;
    boolean isPrecededByModalOrAuxiliary = false;
    for (int i = 0; i < tokens.length; i++) {
        // Note: defaulting to the first analysis is only safe if we only query for sentence start
        String posToken = tokens[i].getAnalyzedToken(0).getPOSTag();
        if (JLanguageTool.SENTENCE_START_TAGNAME.equals(posToken)) {
            continue;
        }
        if (i == 1) { // don't care about first word, UppercaseSentenceStartRule does this already
            prevTokenIsDas = nounIndicators.contains(tokens[1].getToken().toLowerCase());
            continue;
        }
        if (i > 0 && isSalutation(tokens[i - 1].getToken())) { // e.g. "Frau Stieg" could be a name, ignore
            continue;
        }
        AnalyzedTokenReadings analyzedToken = tokens[i];
        String token = analyzedToken.getToken();

        // --- Part 1: candidate for a lowercase match (untagged or verb reading, and the token is its own lemma) ---
        boolean isBaseform = analyzedToken.getReadingsLength() >= 1 && analyzedToken.hasLemma(token);
        if ((analyzedToken.getAnalyzedToken(0).getPOSTag() == null
                || GermanHelper.hasReadingOfType(analyzedToken, GermanToken.POSType.VERB)) && isBaseform) {
            boolean nextTokenIsPersonalOrReflexivePronoun = false;
            if (i < tokens.length - 1) {
                AnalyzedTokenReadings nextToken = tokens[i + 1];
                // avoid false alarm for "Das haben wir getan." etc:
                nextTokenIsPersonalOrReflexivePronoun = nextToken.hasPartialPosTag("PRO:PER")
                        || StringUtils.equalsAny(nextToken.getToken(), "sich", "Sie");
                if (nextToken.hasPosTag("PKT")) {
                    // avoid false alarm for "So sollte das funktionieren." (might also remove true alarms...)
                    continue;
                }
                // Precedence note: && binds tighter than ||, so the VER:AUX/VER:MOD check on the token
                // two positions back applies regardless of prevTokenIsDas.
                if (prevTokenIsDas
                        && (DAS_VERB_EXCEPTIONS.contains(nextToken.getToken())
                                || isFollowedByRelativeOrSubordinateClause(i, tokens))
                        || (i > 1 && hasPartialTag(tokens[i - 2], "VER:AUX", "VER:MOD"))) {
                    // avoid false alarm for "Er kann ihr das bieten, was sie verdient."
                    // avoid false alarm for "Das wissen die meisten." / "Um das sagen zu können, ..."
                    // avoid false alarm for "Du musst/solltest/könntest das wissen, damit du die Prüfung bestehst / weil wir das gestern besprochen haben."
                    // avoid false alarm for "Wir werden das stoppen."
                    // avoid false alarm for "Wahre Liebe muss das aushalten."
                    continue;
                }
            }
            if (isPrevProbablyRelativePronoun(tokens, i)
                    || (prevTokenIsDas && getTokensWithPartialPosTagCount(tokens, "VER") == 1)) {// ignore sentences containing a single verb, e.g., "Das wissen viele nicht."
                continue;
            }
            potentiallyAddLowercaseMatch(ruleMatches, tokens[i], prevTokenIsDas, token,
                    nextTokenIsPersonalOrReflexivePronoun, sentence);
        }
        // --- Part 2: update the carried state, then look for an uppercase match ---
        // Note that the 'continue' statements above skip this state update for the current token.
        prevTokenIsDas = nounIndicators.contains(tokens[i].getToken().toLowerCase());
        if (analyzedToken.matchesPosTagRegex("VER:(MOD|AUX):[1-3]:.*")) {
            isPrecededByModalOrAuxiliary = true;
        }
        AnalyzedTokenReadings lowercaseReadings = tagger.lookup(token.toLowerCase());
        if (hasNounReading(analyzedToken)) { // it's the spell checker's task to check that nouns are uppercase
            if (!isPotentialUpperCaseError(i, tokens, lowercaseReadings, isPrecededByModalOrAuxiliary)) {
                continue;
            }
        } else if (analyzedToken.hasPosTagStartingWith("SUB:") && i < tokens.length - 1
                && Character.isLowerCase(tokens[i + 1].getToken().charAt(0))
                && tokens[i + 1].matchesPosTagRegex("VER:[123]:.+")) {
            // "Viele Minderjährige sind" but not "Das wirklich Wichtige Verfahren ist"
            continue;
        }
        // Token is untagged and its lowercase form is unknown to the tagger: nothing to compare against.
        if (analyzedToken.getAnalyzedToken(0).getPOSTag() == null && lowercaseReadings == null) {
            continue;
        }
        if (analyzedToken.getAnalyzedToken(0).getPOSTag() == null && lowercaseReadings != null
                && (lowercaseReadings.getAnalyzedToken(0).getPOSTag() == null
                        || analyzedToken.getToken().endsWith("innen"))) {
            continue; // unknown word, probably a name etc.
        }
        potentiallyAddUppercaseMatch(ruleMatches, tokens, i, analyzedToken, token, lowercaseReadings, sentence);
    }
    return toRuleMatchArray(ruleMatches);
}

From source file:it.cnr.isti.hpc.dexter.disambiguation.TurkishEntityDisambiguator.java

/**
 * Scores every candidate entity of the given spots and returns the best-scoring candidate per spot.
 * <p>
 * The method runs several passes over {@code sml.getEntities()}:
 * <ol>
 * <li>collect type/domain frequencies, shingle texts and description tokens of the distinct
 * candidate ids;</li>
 * <li>compute the per-candidate feature scores (similarities to candidates of the surrounding
 * spot window, type/domain, suffix, letter case, name, popularity, Lesk overlap) and track the
 * maximum of each feature that is normalised later;</li>
 * <li>normalise the stored scores by those maxima, once per id;</li>
 * <li>combine the features into a total score (weighted sum, or {@code RankLib} when
 * {@code ranklib} is set), store it on the match and log one tab-separated row per candidate.</li>
 * </ol>
 * Finally each spot's candidate list is sorted and its first element is selected.
 * <p>
 * Side effects: the fields {@code entityScoreMap} and {@code selectedEntities} are replaced and
 * refilled, and {@code setScore} is called on every candidate.
 *
 * @param localParams request parameters; the entry {@code "text"} is read as the input text
 * @param sml the spots with their candidate entities
 * @return a list holding the first candidate (after sorting) of every spot that has candidates
 */
@Override
public EntityMatchList disambiguate(DexterLocalParams localParams, SpotMatchList sml) {
    entityScoreMap = new HashMap<String, EntityScores>();
    selectedEntities = new HashSet<String>();
    Multiset<String> entityFrequencyMultiset = HashMultiset.create();

    EntityMatchList entities = sml.getEntities();
    String inputText = localParams.getParams().get("text");

    // Constant-first comparison: a missing "algorithm.ambigious" property means "false" instead of an NPE.
    boolean ambiguousAllowed = "true".equals(Property.getInstance().get("algorithm.ambigious"));

    List<Token> inputTokens = Zemberek.getInstance().disambiguateFindTokens(inputText, false, true);
    List<Double> documentVector = DescriptionEmbeddingAverage.getAverageVectorList(inputText);
    Multiset<String> inputTokensMultiset = HashMultiset.create();
    for (Token token : inputTokens) {
        inputTokensMultiset.add(token.getMorphText());
    }

    Multiset<String> domainMultiset = HashMultiset.create();
    Multiset<String> typeMultiset = HashMultiset.create();
    HashMap<String, Double> entitySimMap = new HashMap<String, Double>();
    HashSet<String> words = new HashSet<String>();
    Multiset<String> leskWords = HashMultiset.create();

    // first pass for finding number of types and domains
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        String id = em.getId();
        boolean firstOccurrence = !entityFrequencyMultiset.contains(id);
        entityFrequencyMultiset.add(id);
        if (!firstOccurrence) {
            continue;
        }
        Entity entity = em.getEntity();
        words.add(entity.getShingle().getText());
        String type = entity.getPage().getType();
        if (type != null && type.length() > 0) {
            typeMultiset.add(type);
        }
        String domain = entity.getPage().getDomain();
        if (domain != null && domain.length() > 0) {
            domainMultiset.add(domain);
        }

        // Same null guard as in the second pass, which already treats the description as optional.
        String desc = entity.getPage().getDescription();
        if (desc != null) {
            List<Token> tokens = Zemberek.getInstance().disambiguateFindTokens(desc, false, true);
            for (Token token : tokens) {
                leskWords.add(token.getMorphText());
            }
        }
    }

    // The first element of the highest-count-first copy carries the maximum count.
    int maxDomainCount = 0;
    for (String domain : Multisets.copyHighestCountFirst(domainMultiset).elementSet()) {
        maxDomainCount = domainMultiset.count(domain);
        break;
    }
    int maxTypeCount = 0;
    for (String type : Multisets.copyHighestCountFirst(typeMultiset).elementSet()) {
        maxTypeCount = typeMultiset.count(type);
        break;
    }

    // 'words' is not modified after the first pass, so its string form is computed once.
    String wordsText = words.toString();

    double maxSuffixScore = 0, maxLeskScore = 0, maxSimpleLeskScore = 0, maxLinkScore = 0,
            maxHashInfoboxScore = 0, maxwordvecDescriptionLocalScore = 0, maxHashDescriptionScore = 0,
            maxPopularityScore = 0, maxWordvectorAverage = 0, maxWordvecLinksScore = 0;
    // second pass compute similarities between entities in a window
    int currentSpotIndex = -1;
    SpotMatch currentSpot = null;
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        SpotMatch spot = em.getSpot();
        if (currentSpot == null || spot != currentSpot) {
            currentSpotIndex++;
            currentSpot = spot;
        }

        String id = em.getId();
        Entity entity = em.getEntity();
        EntityPage page = entity.getPage();
        String domain = page.getDomain();
        String type = page.getType();

        /* windowing algorithm starts */
        // Window of spots [left, right) around the current spot, shifted to stay inside [0, sml.size()].
        int left = currentSpotIndex - window;
        int right = currentSpotIndex + window;
        if (left < 0) {
            right -= left;
            left = 0;
        }
        if (right > sml.size()) {
            left += (sml.size()) - right;
            right = sml.size();
            if (left < 0) {
                left = 0;
            }
        }

        double linkScore = 0, hashInfoboxScore = 0, wordvecDescriptionLocalScore = 0, hashDescriptionScore = 0,
                wordvecLinksScore = 0;
        for (int j = left; j < right; j++) {
            SpotMatch sm2 = sml.get(j);
            EntityMatchList entities2 = sm2.getEntities();

            // Unless ambiguous spots are allowed, a spot only contributes when exactly one of its
            // candidates has a "w"-prefixed id. The count depends on the spot only, so it is
            // computed once per spot rather than once per candidate.
            if (!ambiguousAllowed) {
                int wPrefixedCandidates = 0;
                for (EntityMatch entityMatch : entities2) {
                    if (entityMatch.getId().startsWith("w")) {
                        wPrefixedCandidates++;
                    }
                }
                if (wPrefixedCandidates != 1) {
                    continue;
                }
            }

            for (EntityMatch em2 : entities2) {
                String id2 = em2.getId();
                // Skip candidates of the same spot and the candidate itself.
                if (em.getSpot() == em2.getSpot() || id.equals(id2)) {
                    continue;
                }
                EntityPage page2 = em2.getEntity().getPage();

                // The following three similarities are only computed between two "w"-prefixed ids.
                // All pairwise similarities are memoised in entitySimMap under "<kind><id><id2>".
                if (id.startsWith("w") && id2.startsWith("w")) {
                    // Link similarity: Jaccard similarity of the space-separated link sets.
                    double linkSim = 0;
                    if (entitySimMap.containsKey("link" + id + id2)) {
                        linkSim = entitySimMap.get("link" + id + id2);
                    } else {
                        HashSet<String> set1 = Sets.newHashSet(page.getLinks().split(" "));
                        HashSet<String> set2 = Sets.newHashSet(page2.getLinks().split(" "));
                        linkSim = JaccardCalculator.calculateSimilarity(set1, set2);
                        entitySimMap.put("link" + id + id2, linkSim);
                    }
                    linkScore += linkSim;

                    // Entity embedding similarity (accumulated into hashInfoboxScore).
                    double eeSim = 0;
                    if (entitySimMap.containsKey("ee" + id + id2)) {
                        eeSim = entitySimMap.get("ee" + id + id2);
                    } else {
                        eeSim = EntityEmbeddingSimilarity.getInstance().getSimilarity(page, page2);
                        entitySimMap.put("ee" + id + id2, eeSim);
                    }
                    hashInfoboxScore += eeSim;

                    // Similarity of the pages' word2vec vectors (accumulated into wordvecLinksScore).
                    double w2veclinksSim = 0;
                    if (entitySimMap.containsKey("wl" + id + id2)) {
                        w2veclinksSim = entitySimMap.get("wl" + id + id2);
                    } else {
                        w2veclinksSim = AveragePooling.getInstance().getSimilarity(page.getWord2vec(),
                                page2.getWord2vec());
                        entitySimMap.put("wl" + id + id2, w2veclinksSim);
                    }
                    wordvecLinksScore += w2veclinksSim;
                }

                // Description word2vec similarity.
                double word2vecSim = 0;
                if (entitySimMap.containsKey("w2v" + id + id2)) {
                    word2vecSim = entitySimMap.get("w2v" + id + id2);
                } else {
                    word2vecSim = AveragePooling.getInstance().getSimilarity(page2.getDword2vec(),
                            page.getDword2vec());
                    entitySimMap.put("w2v" + id + id2, word2vecSim);
                }
                wordvecDescriptionLocalScore += word2vecSim;

                // Description autoencoder similarity (accumulated into hashDescriptionScore).
                double autoVecSim = 0;
                if (entitySimMap.containsKey("a2v" + id + id2)) {
                    autoVecSim = entitySimMap.get("a2v" + id + id2);
                } else {
                    autoVecSim = AveragePooling.getInstance().getSimilarity(page2.getDautoencoder(),
                            page.getDautoencoder());
                    entitySimMap.put("a2v" + id + id2, autoVecSim);
                }
                hashDescriptionScore += autoVecSim;
            }
        }
        if (linkScore > maxLinkScore) {
            maxLinkScore = linkScore;
        }
        if (hashInfoboxScore > maxHashInfoboxScore) {
            maxHashInfoboxScore = hashInfoboxScore;
        }
        if (wordvecDescriptionLocalScore > maxwordvecDescriptionLocalScore) {
            maxwordvecDescriptionLocalScore = wordvecDescriptionLocalScore;
        }
        if (hashDescriptionScore > maxHashDescriptionScore) {
            maxHashDescriptionScore = hashDescriptionScore;
        }
        if (wordvecLinksScore > maxWordvecLinksScore) {
            maxWordvecLinksScore = wordvecLinksScore;
        }
        /* windowing algorithm ends */

        double domainScore = 0;
        if (domainMultiset.size() > 0 && maxDomainCount > 1 && domainMultiset.count(domain) > 1) {
            domainScore = (double) domainMultiset.count(domain) / maxDomainCount;
        }
        double typeScore = 0;
        if (typeMultiset.size() > 0 && maxTypeCount > 1 && typeMultiset.count(type) > 1) {
            typeScore = (double) typeMultiset.count(type) / maxTypeCount;
        }
        if (typeBlackList.contains(type)) {
            typeScore /= 10;
        }

        // 'type' is treated as nullable everywhere else in this method, so guard it here as well.
        double typeContentScore = 0;
        if (type != null && type.length() > 0 && StringUtils.containsIgnoreCase(wordsText, type)) {
            typeContentScore = 1;
        }

        double typeClassifierScore = TypeClassifier.getInstance().predict(page, page.getTitle(), page.getType(),
                entity.getShingle().getSentence());

        double wordvecDescriptionScore = AveragePooling.getInstance().getSimilarity(documentVector,
                page.getDword2vec());
        if (wordvecDescriptionScore > maxWordvectorAverage) {
            maxWordvectorAverage = wordvecDescriptionScore;
        }

        // Suffix score: similarity between the type and every distinct mention of this entity
        // that differs from its title (plain and Turkish-lowercased).
        double suffixScore = 0;
        if (type != null && type.length() > 0) {
            Set<String> suffixes = new HashSet<String>();
            String t = entity.getTitle().toLowerCase(new Locale("tr", "TR"));

            for (int x = 0; x < entities.size(); x++) {
                EntityMatch e2 = entities.get(x);
                if (e2.getId().equals(entity.getId())) {
                    suffixes.add(e2.getMention());
                }
            }
            suffixes.remove(t);
            suffixes.remove(entity.getTitle());
            if (suffixes.size() >= minSuffix) {
                for (String suffix : suffixes) {
                    double sim = gd.calculateSimilarity(suffix, type);
                    suffixScore += sim;
                }
            }
        }
        if (suffixScore > maxSuffixScore) {
            maxSuffixScore = suffixScore;
        }

        // Letter case score: 1 when the mention's casing fits the page's letter-case code and id prefix.
        // (The first branch is subsumed by the last one; kept to mirror the page's letter-case codes.)
        double letterCaseScore = 0;
        int lc = page.getLetterCase();
        if (StringUtils.isAllLowerCase(em.getMention()) && lc == 0 && id.startsWith("t")) {
            letterCaseScore = 1;
        } else if (StringUtils.isAllUpperCase(em.getMention()) && lc == 1 && id.startsWith("w")) {
            letterCaseScore = 1;
        } else if (Character.isUpperCase(em.getMention().charAt(0)) && lc == 2 && id.startsWith("w")) {
            letterCaseScore = 1;
        } else if (StringUtils.isAllLowerCase(em.getMention()) && id.startsWith("t")) {
            letterCaseScore = 1;
        }

        double nameScore = 1 - LevenshteinDistanceCalculator.calculateDistance(page.getTitle(),
                Zemberek.removeAfterSpostrophe(em.getMention()));

        double popularityScore = page.getRank();
        if (id.startsWith("w")) {
            popularityScore = Math.log10(popularityScore + 1);
            if (popularityScore > maxPopularityScore) {
                maxPopularityScore = popularityScore;
            }
        }

        // Lesk scores: overlap of the description tokens with the input text (simple Lesk) and
        // with the description tokens of all candidates (Lesk), ignoring stop words.
        double leskScore = 0, simpleLeskScore = 0;
        String desc = page.getDescription();
        if (desc != null) {
            List<Token> tokens = Zemberek.getInstance().disambiguateFindTokens(desc, false, true);
            for (Token token : tokens) {
                if (inputTokensMultiset.contains(token.getMorphText())
                        && !TurkishNLP.isStopWord(token.getMorphText())) {
                    simpleLeskScore += inputTokensMultiset.count(token.getMorphText());
                }
                if (leskWords.contains(token.getMorphText()) && !TurkishNLP.isStopWord(token.getMorphText())) {
                    leskScore += leskWords.count(token.getMorphText());
                }
            }
            // With no tokens the divisor is log(1) == 0 and 0 / 0 would turn the scores into NaN.
            if (!tokens.isEmpty()) {
                leskScore /= Math.log(tokens.size() + 1);
                simpleLeskScore /= Math.log(tokens.size() + 1);
            }
            if (leskScore > maxLeskScore) {
                maxLeskScore = leskScore;
            }
            if (simpleLeskScore > maxSimpleLeskScore) {
                maxSimpleLeskScore = simpleLeskScore;
            }
        }

        // Register the scores even when the page has no description: the normalisation and
        // total-score passes below look up every id and would otherwise dereference null.
        EntityScores entityScores = entityScoreMap.get(id);
        if (entityScores == null) {
            EntityScores scores = new EntityScores(em, id, popularityScore, nameScore, letterCaseScore,
                    suffixScore, wordvecDescriptionScore, typeContentScore, typeScore, domainScore,
                    hashDescriptionScore, wordvecDescriptionLocalScore, hashInfoboxScore, linkScore,
                    wordvecLinksScore, leskScore, simpleLeskScore, typeClassifierScore);
            entityScoreMap.put(id, scores);
        } else {
            // The id was already seen for another mention: average the context-dependent scores.
            entityScores.setHashInfoboxScore((entityScores.getHashInfoboxScore() + hashInfoboxScore) / 2);
            // Average against the stored description score (previously read the infobox score by mistake).
            entityScores.setHashDescriptionScore(
                    (entityScores.getHashDescriptionScore() + hashDescriptionScore) / 2);
            entityScores.setLinkScore((entityScores.getLinkScore() + linkScore) / 2);
            entityScores.setWordvecDescriptionLocalScore(
                    (entityScores.getWordvecDescriptionLocalScore() + wordvecDescriptionLocalScore) / 2);
            entityScores
                    .setWordvecLinksScore((entityScores.getWordvecLinksScore() + wordvecLinksScore) / 2);
            entityScores.setLeskScore((entityScores.getLeskScore() + leskScore) / 2);
        }
    }

    /* normalization and total score calculation starts */
    // Each id is normalised exactly once, even if it occurs for several mentions.
    Set<String> set = new HashSet<String>();
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        String id = em.getId();
        EntityScores entityScores = entityScoreMap.get(id);
        if (set.contains(id)) {
            continue;
        }
        if (id.startsWith("w")) {
            if (maxLinkScore > 0 && entityScores.getLinkScore() > 0) {
                entityScores.setLinkScore(entityScores.getLinkScore() / maxLinkScore);
            }
            if (maxHashInfoboxScore > 0 && entityScores.getHashInfoboxScore() > 0) {
                entityScores.setHashInfoboxScore(entityScores.getHashInfoboxScore() / maxHashInfoboxScore);
            }
            if (maxWordvecLinksScore > 0 && entityScores.getWordvecLinksScore() > 0) {
                entityScores.setWordvecLinksScore(entityScores.getWordvecLinksScore() / maxWordvecLinksScore);
            }
            if (maxPopularityScore > 0 && entityScores.getPopularityScore() > 0) {
                entityScores.setPopularityScore(entityScores.getPopularityScore() / maxPopularityScore);
            }
        }
        if (maxwordvecDescriptionLocalScore > 0 && entityScores.getWordvecDescriptionLocalScore() > 0) {
            entityScores.setWordvecDescriptionLocalScore(
                    entityScores.getWordvecDescriptionLocalScore() / maxwordvecDescriptionLocalScore);
        }
        if (maxHashDescriptionScore > 0 && entityScores.getHashDescriptionScore() > 0) {
            entityScores
                    .setHashDescriptionScore(entityScores.getHashDescriptionScore() / maxHashDescriptionScore);
        }
        if (maxWordvectorAverage > 0 && entityScores.getWordvecDescriptionScore() > 0) {
            entityScores.setWordvecDescriptionScore(
                    entityScores.getWordvecDescriptionScore() / maxWordvectorAverage);
        }
        if (maxLeskScore > 0 && entityScores.getLeskScore() > 0) {
            entityScores.setLeskScore(entityScores.getLeskScore() / maxLeskScore);
        }
        if (maxSimpleLeskScore > 0 && entityScores.getSimpleLeskScore() > 0) {
            entityScores.setSimpleLeskScore(entityScores.getSimpleLeskScore() / maxSimpleLeskScore);
        }
        if (maxSuffixScore > 0 && entityScores.getSuffixScore() > 0) {
            entityScores.setSuffixScore(entityScores.getSuffixScore() / maxSuffixScore);
        }
        set.add(id);
    }

    LOGGER.info("\t"
            + "id\tTitle\tURL\tScore\tPopularity\tName\tLesk\tSimpeLesk\tCase\tNoun\tSuffix\tTypeContent\tType\tDomain\twordvecDescription\twordvecDescriptionLocal\thashDescription\thashInfobox\tword2vecLinks\tLink\t\ttypeClassifier\tDescription");
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        String id = em.getId();
        EntityScores e = entityScoreMap.get(id);
        // Bonus when the id prefix agrees with the case of the mention's first character.
        double wikiScore = 0;
        if (id.startsWith("w") && Character.isUpperCase(em.getMention().charAt(0))) {
            wikiScore = wikiWeight;
        } else if (id.startsWith("t") && Character.isLowerCase(em.getMention().charAt(0))) {
            wikiScore = wikiWeight;
        }
        double totalScore = wikiScore + e.getPopularityScore() * popularityWeight
                + e.getNameScore() * nameWeight + e.getLeskScore() * leskWeight
                + e.getSimpleLeskScore() * simpleLeskWeight + e.getLetterCaseScore() * letterCaseWeight
                + e.getSuffixScore() * suffixWeight + e.getTypeContentScore() * typeContentWeight
                + e.getTypeScore() * typeWeight + e.getDomainScore() * domainWeight
                + e.getWordvecDescriptionScore() * wordvecDescriptionWeight
                + e.getWordvecDescriptionLocalScore() * wordvecDescriptionLocalWeight
                + e.getHashDescriptionScore() * hashDescriptionWeight
                + e.getHashInfoboxScore() * hashInfoboxWeight + e.getWordvecLinksScore() * word2vecLinksWeight
                + e.getLinkScore() * linkWeight + e.getTypeClassifierkScore() * typeClassifierkWeight;
        if (ranklib) {
            // The learned ranker replaces the weighted sum entirely.
            totalScore = RankLib.getInstance().score(e);
        }

        // Pages whose URL title contains "(" are penalised by halving their score.
        if (em.getEntity().getPage().getUrlTitle().contains("(")) {
            totalScore /= 2;
        }
        em.setScore(totalScore);
        e.setScore(totalScore);

        LOGGER.info("\t" + id + "\t" + em.getEntity().getPage().getTitle() + "\t"
                + em.getEntity().getPage().getUrlTitle() + "\t" + em.getScore() + "\t"
                + e.getPopularityScore() * popularityWeight + "\t" + e.getNameScore() * nameWeight + "\t"
                + e.getLeskScore() * leskWeight + "\t" + e.getSimpleLeskScore() * simpleLeskWeight + "\t"
                + e.getLetterCaseScore() * letterCaseWeight + "\t" + e.getSuffixScore() * suffixWeight + "\t"
                + e.getTypeContentScore() * typeContentWeight + "\t" + e.getTypeScore() * typeWeight + "\t"
                + e.getDomainScore() * domainWeight + "\t"
                + e.getWordvecDescriptionScore() * wordvecDescriptionWeight + "\t"
                + e.getWordvecDescriptionLocalScore() * wordvecDescriptionLocalWeight + "\t"
                + e.getHashDescriptionScore() * hashDescriptionWeight + "\t"
                + e.getHashInfoboxScore() * hashInfoboxWeight + "\t"
                + e.getWordvecLinksScore() * word2vecLinksWeight + "\t" + e.getLinkScore() * linkWeight + "\t"
                + e.getTypeClassifierkScore() * typeClassifierkWeight + "\t"
                + em.getEntity().getPage().getDescription());
    }

    // Pick the first candidate of every non-empty spot after sorting its candidate list.
    EntityMatchList eml = new EntityMatchList();
    for (SpotMatch match : sml) {
        EntityMatchList list = match.getEntities();
        if (!list.isEmpty()) {
            list.sort();
            eml.add(list.get(0));
            selectedEntities.add(list.get(0).getId());
        }
    }
    return eml;
}

From source file:models.persistence.lecture.Lecture.java

/**
 * Builds an abbreviated form of {@code name}.
 * <p>
 * Upper case letters, digits and the characters {@code '/'}, {@code ' '}, {@code '+'} and
 * {@code '-'} are kept. Each kept upper case letter is additionally followed by those of the
 * next two characters that are lower case. Afterwards double spaces and the sequences
 * {@code "AE"}, {@code "OE"} and {@code "UE"} are removed and the result is trimmed.
 * <p>
 * NOTE(review): the replacement literals look as if non-ASCII characters (umlauts) were lost
 * in an encoding conversion - confirm the intended mapping against the upstream source.
 *
 * @return the abbreviated name
 */
@JsonIgnore
public String getShortName() {
    StringBuilder abbreviation = new StringBuilder();
    final int length = name.length();

    for (int pos = 0; pos < length; pos++) {
        char current = name.charAt(pos);
        boolean upper = Character.isUpperCase(current);
        boolean kept = upper || Character.isDigit(current) || current == '/' || current == ' '
                || current == '+' || current == '-';
        if (!kept) {
            continue;
        }
        abbreviation.append(current);

        if (upper) {
            // Look at the two characters after the capital; the capital itself is never lower case.
            int limit = Math.min(pos + 3, length);
            for (int next = pos + 1; next < limit; next++) {
                char follower = name.charAt(next);
                if (Character.isLowerCase(follower)) {
                    abbreviation.append(follower);
                }
            }
        }
    }

    String shortName = abbreviation.toString();
    shortName = shortName.replaceAll("  ", "");
    shortName = shortName.replaceAll("AE", "");
    shortName = shortName.replaceAll("OE", "");
    shortName = shortName.replaceAll("UE", "");
    return shortName.trim();
}

From source file:com.svi.uzabase.logic.ValidationProcess.java

/**
 * Validates every field of every holder in the batch and records each problem found on the
 * field itself through {@code f.add(message)}.
 * <p>
 * Fields whose type is {@code "none"} or whose value is {@code "*N/A"} are skipped, except
 * that a {@code "tin"} field with the value {@code "*N/A"} is flagged. Which checks run
 * depends on the field type; see the individual {@code case} labels.
 * </p>
 * <p>
 * Before validating, {@code progress} and {@code total} are re-created and handed to
 * {@code mf}. An {@code EncryptedDocumentException} or {@code SuggesterException} is logged
 * at SEVERE level and not rethrown.
 * </p>
 *
 * @param xmlBatchHolder the holders whose fields are validated in place
 * @return the same list instance that was passed in
 */
private List<XMLHolder> validateData(List<XMLHolder> xmlBatchHolder) {
    try {
        int totalCounter = 0;
        //Initialize dictionary
        String dictFileName = "file://./dic/english.jar";
        String configFile = "file://./classes/spellCheck.config";
        BasicDictionary dictionary = new BasicDictionary(dictFileName);
        SpellCheckConfiguration configuration = new SpellCheckConfiguration(configFile);
        BasicSuggester suggester = new BasicSuggester(configuration);
        suggester.attach(dictionary);

        // create SpellCheck object based on configuration and specify Suggester
        SpellCheck spellCheck = new SpellCheck(configuration);
        spellCheck.setSuggester(suggester);
        //set for jprogress bar
        for (XMLHolder h : xmlBatchHolder) {
            totalCounter += h.size();
        }
        progress = new AtomicInteger(0);
        total = new AtomicInteger(totalCounter);
        mf.setJprogressValues(total, progress);

        //validation process begins here
        String[] invalidWords = { "corporation", "inc.", "city", "corp.", "st.", "co.", "ltd." };
        String[] invalidBoardWords = { "other", "oth" };
        String[] validWords = { "loc", "to", "ext", "local" };
        String[] invalidCharacters = { ",", "/", "\\", "[", "]", "\"", ":", "^", "{", "}", "%", "+", "#", "(",
                ")" };
        // 'yyyy' is the calendar year and 'dd' the day of month. The upper-case letters
        // 'YYYY' / 'DD' mean week-year / day-of-year and do not describe a year/month/day date.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd");
        SimpleDateFormat fiscalYear = new SimpleDateFormat("MM/dd");
        sdf.setLenient(false);
        UrlValidator urlValidator = new UrlValidator();

        for (XMLHolder h : xmlBatchHolder) {
            for (Field f : h) {
                mf.loader("Validating fields: ", false);
                if (!f.getType().equals("none") && !f.getValue().equals("*N/A")) {
                    switch (f.getType()) {
                        case "city":
                            if (f.getValue().isEmpty()) {
                                f.add("Address is empty");
                            } else {
                                if (hasWhiteSpaceTrailing(f.getValue())) {
                                    f.add("has trailing white space ");
                                }
                                if (cityList.indexOf(f.getValue()) < 0) {
                                    f.add("City not found on list!");
                                }
                            }
                            break;
                        case "province":
                            if (f.getValue().isEmpty()) {
                                f.add("Address is empty");
                            } else {
                                if (hasWhiteSpaceTrailing(f.getValue())) {
                                    f.add("has trailing white space ");
                                }
                                if (provinceList.indexOf(f.getValue()) < 0) {
                                    f.add("Province not found on list!");
                                }
                            }
                            break;
                        case "tel":
                            if (!f.getValue().isEmpty()) {
                                // Letters are only acceptable when one of the allowed words
                                // (e.g. "loc", "ext") is present; report the problem once.
                                if (f.getValue().matches(".*[a-zA-Z]+.*")
                                        && findFirstContained(f.getValue(), validWords) == null) {
                                    f.add("Invalid telephone number");
                                }

                                int telLength = f.getValue().replace(" ", "").replace("-", "").length();
                                if (telLength < 7 || telLength > 8) {
                                    f.add("Invalid telephone number length");
                                }

                                if (hasWhiteSpaceTrailing(f.getValue())) {
                                    f.add("has trailing white space ");
                                }
                                if (StringUtils.countMatches(f.getValue(), "-") > 2) {
                                    f.add("Invalid telephone number");
                                }

                                String badTelChar = findFirstContained(f.getValue(), invalidCharacters);
                                if (badTelChar != null) {
                                    f.add("Contains invalid character [ " + badTelChar + " ]");
                                }
                            }
                            break;
                        case "fax":
                            if (!f.getValue().isEmpty()) {
                                // Same rule as for "tel": letters need one of the allowed words.
                                if (f.getValue().matches(".*[a-zA-Z]+.*")
                                        && findFirstContained(f.getValue(), validWords) == null) {
                                    f.add("Invalid fax number");
                                }
                                if (f.getValue().replace(" ", "").length() < 6) {
                                    f.add("Invalid fax number");
                                }
                                if (StringUtils.countMatches(f.getValue(), "-") > 1) {
                                    f.add("Invalid fax number");
                                }
                                if (hasWhiteSpaceTrailing(f.getValue())) {
                                    f.add("has trailing white space ");
                                }
                                String badFaxChar = findFirstContained(f.getValue(), invalidCharacters);
                                if (badFaxChar != null) {
                                    f.add("Contains invalid character [ " + badFaxChar + " ]");
                                }
                            }
                            break;
                        case "person":
                            if (!f.getValue().isEmpty()) {
                                if (!f.getValue().matches("[a-zA-Z\\.,\\- ()]+")) {
                                    f.add("Invalid name");
                                }
                                if (f.getValue().matches("[a-z ]+")) {
                                    f.add("All small caps");
                                }
                                if (f.getValue().matches("\\w+")) {
                                    f.add("Only one word");
                                }
                                if (f.getValue().replace(" ", "").length() > 30) {
                                    f.add("More than 30 characters.");
                                }
                                if (f.getValue().replace(" ", "").length() < 2) {
                                    f.add("Invalid name.");
                                }
                                if (hasWhiteSpaceTrailing(f.getValue())) {
                                    f.add("has trailing white space ");
                                }
                            }
                            break;
                        case "email":
                            if (!f.getValue().isEmpty()) {
                                if (!EmailValidator.getInstance(true).isValid(f.getValue())) {
                                    f.add("Invalid email");
                                }
                            }
                            break;
                        case "website":
                            if (!f.getValue().isEmpty()) {
                                // Values without a scheme are validated as plain http URLs.
                                String tempURL = f.getValue().contains("http") ? f.getValue()
                                        : "http://" + f.getValue();
                                if (!urlValidator.isValid(tempURL)) {
                                    f.add("Invalid website");
                                }
                                if (hasWhiteSpaceTrailing(f.getValue())) {
                                    f.add("has trailing white space ");
                                }
                            }
                            break;
                        case "name":
                        case "stockholder":
                            // Both field types share exactly the same rules.
                            if (!f.getValue().isEmpty()) {
                                if (!f.getValue().matches("[a-zA-Z\\.,\\-() ]+")) {
                                    f.add("Invalid name");
                                }
                                // NOTE(review): the limit checked is 30 but the message says 50;
                                // confirm which one is intended.
                                if (f.getValue().replace(" ", "").length() > 30) {
                                    f.add("More than 50 characters.");
                                }
                                if (f.getValue().matches("[a-z ]+")) {
                                    f.add("All small caps");
                                }
                                if (f.getValue().matches("\\w+")) {
                                    f.add("Only one word");
                                }
                                if (f.getValue().replace(" ", "").length() < 2) {
                                    f.add("Invalid name.");
                                }
                                if (hasWhiteSpaceTrailing(f.getValue())) {
                                    f.add("has trailing white space ");
                                }
                                String badNameWord = findFirstContained(f.getValue(), invalidWords);
                                if (badNameWord != null) {
                                    f.add("Contains invalid word: " + badNameWord);
                                }
                            }
                            break;
                        case "board":
                            if (!f.getValue().isEmpty()) {
                                if (!f.getValue().matches("[a-zA-Z\\.,\\-() ]+")) {
                                    f.add("Invalid position");
                                }
                                String badBoardChar = findFirstContained(f.getValue(), invalidCharacters);
                                if (badBoardChar != null) {
                                    f.add("Contains invalid character [ " + badBoardChar + " ]");
                                }
                                String badBoardWord = findFirstContained(f.getValue(), invalidBoardWords);
                                if (badBoardWord != null) {
                                    f.add("Contains invalid word [ " + badBoardWord + " ]");
                                }

                                if (f.getValue().equalsIgnoreCase("N") || f.getValue().equalsIgnoreCase("Y")) {
                                    f.add("is letter " + f.getValue() + " only");
                                }
                                // Safe: the value is known to be non-empty here.
                                if (Character.isLowerCase(f.getValue().charAt(0))) {
                                    f.add("starts with a lower case letter");
                                }

                                spellCheck.setText(f.getValue(), Constants.DOC_TYPE_TEXT, "en");
                                spellCheck.check();
                                if (spellCheck.hasMisspelt()) {
                                    f.add("word is misspelled.");
                                }
                            }
                            break;
                        case "corporation":
                            if (companyList.indexOf(f.getValue().toUpperCase()) < 0) {
                                f.add("Company name not found on table.");
                            }
                            break;
                        case "sec":
                            if (StringUtils.countMatches(f.getValue(), "-") > 1) {
                                f.add("Invalid SEC number");
                            }
                            if (f.getValue().replace(" ", "").length() > 9) {
                                f.add("SEC number more than 9 digits.");
                            }
                            if (hasWhiteSpaceTrailing(f.getValue())) {
                                f.add("SEC has trailing white space.");
                            }
                            String badSecChar = findFirstContained(f.getValue(), invalidCharacters);
                            if (badSecChar != null) {
                                f.add("Contains invalid character [ " + badSecChar + " ]");
                            }
                            break;
                        case "tin":
                            if (f.getValue().isEmpty()) {
                                f.add("TIN is empty");
                            }
                            if (hasWhiteSpaceTrailing(f.getValue())) {
                                f.add("TIN has trailing white space.");
                            }
                            if (!f.getValue().matches("[0-9]+")) {
                                f.add("invalid TIN number");
                            }
                            int tinLength = f.getValue().replace(" ", "").replace("-", "").length();
                            if (tinLength > 12 || tinLength < 9) {
                                f.add("TIN number invalid length.");
                            }
                            if (StringUtils.countMatches(f.getValue(), "-") > 1) {
                                f.add("Invalid TIN number");
                            }
                            String badTinChar = findFirstContained(f.getValue(), invalidCharacters);
                            if (badTinChar != null) {
                                f.add("Contains invalid character [ " + badTinChar + " ]");
                            }
                            break;
                        case "nationality":
                            if (!f.getValue().isEmpty()) {
                                if (nationalityList.indexOf(f.getValue()) < 0) {
                                    f.add("nationality is misspelled.");
                                }
                            }
                            break;
                        case "purpose":
                            // Spell-check word by word so the offending word can be reported.
                            for (String word : f.getValue().split(" ")) {
                                spellCheck.setText(word, Constants.DOC_TYPE_TEXT, "en");
                                spellCheck.check();
                                if (spellCheck.hasMisspelt()) {
                                    f.add("word is misspelled. ( " + spellCheck.getMisspelt() + " )");
                                }
                            }
                            break;
                        case "periodCovered":
                            try {
                                // Round-trip: the value must equal its own canonical rendering.
                                Date periodDate = sdf.parse(f.getValue());
                                if (!f.getValue().equals(sdf.format(periodDate))) {
                                    f.add("Invalid date format");
                                }
                            } catch (ParseException ex) {
                                f.add("Invalid date format");
                            }
                            break;
                        case "fiscalYear":
                            try {
                                // Format with the same month/day pattern used for parsing;
                                // comparing against the full-date pattern could never match.
                                Date fiscalDate = fiscalYear.parse(f.getValue());
                                if (!f.getValue().equals(fiscalYear.format(fiscalDate))) {
                                    f.add("Invalid date format");
                                }
                            } catch (ParseException ex) {
                                f.add("Invalid date format");
                            }
                            break;
                        case "position":
                            // A position must not contain digits. String.contains() treats its
                            // argument literally, so a regex match is required here.
                            if (f.getValue().matches(".*\\d.*")) {
                                f.add("Invalid position/designation");
                            }
                            // NOTE(review): the bounds checked are 3..10 but the message says
                            // "More than 30 characters."; confirm which one is intended.
                            if (f.getValue().replace(" ", "").length() > 10
                                    || f.getValue().replace(" ", "").length() < 3) {
                                f.add("More than 30 characters.");
                            }
                            break;
                        case "shareType":
                            if (f.getValue().toLowerCase().contains("total")) {
                                f.add("Share type contains total.");
                            }

                            if (f.getValue().replace(" ", "").length() > 20) {
                                f.add("Share type More than 20 characters.");
                            }
                            break;
                        case "ownership":
                            try {
                                if (Double.parseDouble(f.getValue()) > 100) {
                                    f.add("Percent ownership more than 100%");
                                }
                            } catch (NumberFormatException ex) {
                                // A non-numeric value is a validation finding, not a reason
                                // to abort the validation of the whole batch.
                                f.add("Invalid percent ownership");
                            }
                            break;
                        default:
                            break;
                    }
                } else if (f.getType().equals("tin") && f.getValue().equals("*N/A")) {
                    f.add("TIN is N/A");
                }
            }
        }

    } catch (EncryptedDocumentException | SuggesterException ex) {
        Logger.getLogger(ValidationProcess.class.getName()).log(Level.SEVERE, null, ex);
    }
    return xmlBatchHolder;
}

/**
 * Returns the first entry of {@code candidates} that occurs as a substring of {@code value}.
 *
 * @param value      the text to search in
 * @param candidates the substrings to look for, tested in array order
 * @return the first contained candidate, or {@code null} when none is contained
 */
private static String findFirstContained(String value, String[] candidates) {
    for (String candidate : candidates) {
        if (value.contains(candidate)) {
            return candidate;
        }
    }
    return null;
}

From source file:net.yacy.cora.document.id.MultiProtocolURL.java

/**
 * <p>
 * Percent-encode/escape a URL path part according to the allowed characters
 * specified in RFC3986 (formerly RFC1738 and RFC2396). Uses UTF-8 character
 * codes for non-ASCII.
 * </p>
 * <p>
 * When isPattern is true, the string is processed as a regular expression, and
 * therefore meta-characters used by the {@link Pattern} class are not
 * percent-encoded.
 * </p>
 * <p>
 * Existing valid percent-encoded triplets are kept, with their hexadecimal
 * digits normalized to upper case. The input instance itself is returned when
 * nothing had to be changed.
 * </p>
 * <p>
 * NOTE(review): the input is processed one {@code char} at a time, so each half
 * of a surrogate pair goes through the three-byte branch on its own instead of
 * the pair being encoded as one four-byte UTF-8 sequence.
 * </p>
 *
 * @param pathToEscape the path part to escape.
 * @param isPattern    when true, regular meta-characters are not escaped
 * @return an escaped path regular expression with only allowed ASCII
 *         characters, or null when pathToEscape is null.
 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986
 *      percent-encoding section</a>
 * @see <a href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path
 *      definition</a>
 */
private static String escapePath(final String pathToEscape, final boolean isPattern) {
    if (pathToEscape == null) {
        return pathToEscape;
    }
    final int length = pathToEscape.length();
    final StringBuilder escaped = new StringBuilder(length + 10);
    boolean changed = false;

    for (int pos = 0; pos < length; pos++) {
        final int c = pathToEscape.charAt(pos);

        if (c == '%' && pos + 2 < length) {
            final char first = pathToEscape.charAt(pos + 1);
            final char second = pathToEscape.charAt(pos + 2);
            if (!isHexDigit(first) || !isHexDigit(second)) {
                /* A lone '%' that does not start a valid triplet : encode it now */
                escaped.append(hex[c]);
                changed = true;
                continue;
            }
            /* Already percent-encoded : keep it, but with upper case hexadecimal digits */
            if (Character.isLowerCase(first) || Character.isLowerCase(second)) {
                changed = true;
            }
            escaped.append((char) c);
            escaped.append(Character.toUpperCase(first));
            escaped.append(Character.toUpperCase(second));
            /* Skip the two digits just consumed; the loop increment skips the '%' itself */
            pos += 2;
            continue;
        }

        if (isPattern && PATTERN_METACHARACTERS.get(c)) {
            escaped.append((char) c);
            continue;
        }

        if (c <= 0x7F) {
            /* ASCII : copy when allowed in a path, otherwise percent-encode */
            if (UNRESERVED_PATH.get(c)) {
                escaped.append((char) c);
            } else {
                escaped.append(hex[c]);
                changed = true;
            }
            continue;
        }

        if (c <= 0x07FF) {
            /* non-ASCII <= 0x7FF : two bytes */
            escaped.append(hex[0xc0 | (c >> 6)]);
            escaped.append(hex[0x80 | (c & 0x3F)]);
        } else {
            /* 0x7FF < c <= 0xFFFF : three bytes */
            escaped.append(hex[0xe0 | (c >> 12)]);
            escaped.append(hex[0x80 | ((c >> 6) & 0x3F)]);
            escaped.append(hex[0x80 | (c & 0x3F)]);
        }
        changed = true;
    }

    return changed ? escaped.toString() : pathToEscape;
}

From source file:org.gvnix.service.roo.addon.addon.util.WsdlParserUtils.java

/**
 * Returns {@code name} with its first character converted to upper case.
 * <p>
 * The argument itself is returned when it is {@code null}, empty, or when its
 * first character is not a lower-case character.
 * </p>
 *
 * @param name the text to capitalize, may be {@code null}
 * @return the capitalized text, or {@code name} unchanged as described above
 */
public static String capitalizeFirstChar(String name) {
    if (name == null || name.length() == 0) {
        return name;
    }

    final char first = name.charAt(0);
    if (!Character.isLowerCase(first)) {
        return name;
    }

    return new StringBuilder(name.length()).append(Character.toUpperCase(first)).append(name, 1, name.length())
            .toString();
}

From source file:org.apache.axis.utils.JavaUtils.java

/**
 * Map an XML name to a Java identifier per
 * the mapping rules of JSR 101 (in version 1.0 this is
 * "Chapter 20: Appendix: Mapping of XML Names").
 *
 * @param name is the xml name
 * @return the java name per JSR 101 specification; {@code name} itself when it
 *         is {@code null} or empty
 */
public static String xmlNameToJava(String name) {
    // Nothing to map for null or empty input.
    if (name == null || name.equals("")) {
        return name;
    }

    final char[] chars = name.toCharArray();
    final int length = chars.length;
    final StringBuilder javaName = new StringBuilder(length);

    // Skip leading characters that cannot start a Java identifier.
    int pos = 0;
    while (pos < length && (isPunctuation(chars[pos]) || !Character.isJavaIdentifierStart(chars[pos]))) {
        pos++;
    }

    // True when the next accepted character begins a new word and must be upper-cased.
    boolean capitalizeNext = false;
    if (pos < length) {
        // Decapitalization is not done here; it happens at the end through the
        // Introspector, once all bad characters are filtered out.
        final char first = chars[pos];
        javaName.append(first);
        capitalizeNext = !(Character.isLetter(first) || first == '_');
    } else if (Character.isJavaIdentifierPart(chars[0])) {
        // No valid start character at all: the identifier cannot be mapped strictly
        // according to JSR 101, so prefix the first character with an underscore.
        javaName.append('_').append(chars[0]);
    } else {
        // Nothing in the XML name maps to Java. Using the length of the name
        // makes the result somewhat unique.
        javaName.append('_').append(length);
    }

    // Characters that are not letters or digits are dropped, and the first
    // accepted character after a dropped one is upper-cased.
    for (int k = pos + 1; k < length; k++) {
        final char c = chars[k];

        if (isPunctuation(c) || !Character.isJavaIdentifierPart(c)) {
            capitalizeNext = true;
            continue;
        }

        final boolean upperCaseIt = capitalizeNext && Character.isLowerCase(c);
        javaName.append(upperCaseIt ? Character.toUpperCase(c) : c);

        // A legal identifier character that is neither a letter nor '_' also starts
        // a new word. For example: "22hi" becomes "22Hi".
        capitalizeNext = !(Character.isLetter(c) || c == '_');
    }

    String mapped = javaName.toString();

    // Follow JavaBean rules, but only when the first letter is upper case.
    if (Character.isUpperCase(mapped.charAt(0))) {
        mapped = Introspector.decapitalize(mapped);
    }

    // Avoid clashing with a Java keyword.
    return isJavaKeyword(mapped) ? makeNonJavaKeyword(mapped) : mapped;
}

From source file:org.languagetool.rules.de.CaseRule.java

/**
 * Adds a rule match suggesting the capitalized form of a lower-case infinitive
 * that follows "das", e.g. essen -> Essen.
 * <p>
 * A match is only added when all of the following hold: the previous token is
 * "das", the next token is not a personal or reflexive pronoun, {@code token}
 * starts with a lower-case character and is not listed in
 * {@code substVerbenExceptions}, and the readings carry a POS tag starting with
 * {@code "VER:INF"} while being neither ignored by the speller nor immunized.
 * </p>
 */
private void potentiallyAddLowercaseMatch(List<RuleMatch> ruleMatches, AnalyzedTokenReadings tokenReadings,
        boolean prevTokenIsDas, String token, boolean nextTokenIsPersonalOrReflexivePronoun,
        AnalyzedSentence sentence) {
    // Context check: only "das <verb>" not followed by a pronoun is of interest.
    if (!prevTokenIsDas || nextTokenIsPersonalOrReflexivePronoun) {
        return;
    }
    // Only lower-case words that are not known exceptions.
    if (!Character.isLowerCase(token.charAt(0)) || substVerbenExceptions.contains(token)) {
        return;
    }
    // Must be an infinitive that is neither ignored by the speller nor immunized.
    if (!tokenReadings.hasPosTagStartingWith("VER:INF") || tokenReadings.isIgnoredBySpeller()
            || tokenReadings.isImmunized()) {
        return;
    }

    final String suggestion = StringTools.uppercaseFirstChar(tokenReadings.getToken());
    addRuleMatch(ruleMatches, sentence, LOWERCASE_MESSAGE, tokenReadings, suggestion);
}