Example usage for com.google.common.collect Multiset contains

Introduction

On this page you can find example usage of com.google.common.collect Multiset#contains.

Prototype

@Override
boolean contains(@Nullable Object element);

Document

Determines whether this multiset contains the specified element.
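
Before the project examples below, here is a minimal, self-contained sketch of the call; the element values are illustrative:

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class MultisetContainsDemo {
    public static void main(String[] args) {
        Multiset<String> bag = HashMultiset.create();
        bag.add("apple");
        bag.add("apple");
        bag.add("banana");

        // contains is true as soon as the element occurs at least once,
        // regardless of its multiplicity
        System.out.println(bag.contains("apple"));  // true
        System.out.println(bag.contains("cherry")); // false

        // use count(...) when the multiplicity itself matters
        System.out.println(bag.count("apple"));     // 2
    }
}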

Usage

From source file:org.xpect.expectation.impl.ExpectationCollection.java

protected boolean matchesUnordered(ActualCollection actual) {
    if (isPure())
        return items.equals(actual.items);
    Multiset<ActualItem> act = HashMultiset.create(actual);
    for (ExpectationItem item : this) {
        if (item.isWildcard())
            continue;
        if (item.isNegated()) {
            if (act.contains(item))
                return false;
        } else if (!act.contains(item))
            return false;
    }
    return true;
}
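
Note that, as the prototype above shows, contains takes a plain @Nullable Object rather than the multiset's element type, so looking up an ExpectationItem in a Multiset&lt;ActualItem&gt; compiles; whether it matches is decided entirely by the equals implementations of the two item classes.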

From source file:org.summer.dsl.model.types.util.TypeConformanceComputer.java

/**
 * Keeps the cumulated distance for all the common raw super types of the given references.
 * Interfaces that are more directly implemented will get a lower total count than more general
 * interfaces.
 */
protected void cumulateDistance(final List<JvmTypeReference> references,
        Multimap<JvmType, JvmTypeReference> all, Multiset<JvmType> cumulatedDistance) {
    for (JvmTypeReference other : references) {
        Multiset<JvmType> otherDistance = LinkedHashMultiset.create();
        initializeDistance(other, all, otherDistance);
        cumulatedDistance.retainAll(otherDistance);
        for (Multiset.Entry<JvmType> typeToDistance : otherDistance.entrySet()) {
            if (cumulatedDistance.contains(typeToDistance.getElement()))
                cumulatedDistance.add(typeToDistance.getElement(), typeToDistance.getCount());
        }
    }
}

From source file:org.summer.dsl.xbase.typesystem.conformance.TypeConformanceComputer.java

/**
 * Keeps the cumulated distance for all the common raw super types of the given references.
 * Interfaces that are more directly implemented will get a lower total count than more general
 * interfaces.
 */
protected void cumulateDistance(final List<LightweightTypeReference> references,
        Multimap<JvmType, LightweightTypeReference> all, Multiset<JvmType> cumulatedDistance) {
    for (LightweightTypeReference other : references) {
        Multiset<JvmType> otherDistance = LinkedHashMultiset.create();
        initializeDistance(other, all, otherDistance);
        cumulatedDistance.retainAll(otherDistance);
        for (Multiset.Entry<JvmType> typeToDistance : otherDistance.entrySet()) {
            if (cumulatedDistance.contains(typeToDistance.getElement()))
                cumulatedDistance.add(typeToDistance.getElement(), typeToDistance.getCount());
        }
    }
}
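
Both variants above use the same multiset idiom: retainAll keeps only the types seen in every reference processed so far, and contains guards the add so that distances accumulate only for types that survived the intersection. A standalone sketch of that pattern, with illustrative type names and distances:

import com.google.common.collect.LinkedHashMultiset;
import com.google.common.collect.Multiset;

public class CumulateDistanceSketch {
    public static void main(String[] args) {
        // cumulated distances after processing the first reference
        Multiset<String> cumulated = LinkedHashMultiset.create();
        cumulated.add("Collection", 1);
        cumulated.add("Iterable", 2);
        cumulated.add("List", 1);

        // distances contributed by the next reference
        Multiset<String> other = LinkedHashMultiset.create();
        other.add("Collection", 2);
        other.add("Iterable", 3);

        // keep only the types common to both references...
        cumulated.retainAll(other);
        // ...then add the new distances on top of the kept counts
        for (Multiset.Entry<String> e : other.entrySet()) {
            if (cumulated.contains(e.getElement())) {
                cumulated.add(e.getElement(), e.getCount());
            }
        }
        System.out.println(cumulated); // [Collection x 3, Iterable x 5]
    }
}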

From source file:org.summer.dsl.xbase.typesystem.references.LightweightTypeReference.java

/**
 * Returns the list of all super types which includes the super class and the 
 * implemented interfaces. The type parameters of the provided super types are resolved.
 * That means, the super types of <code>ArrayList&lt;String&gt;</code> includes
 * <code>List&lt;String&gt;</code> and <code>Collection&lt;String&gt;</code> 
 * rather than <code>Collection&lt;E&gt;</code>.
 *
 * @return the list of all super types, can be empty.
 */
public List<LightweightTypeReference> getAllSuperTypes() {
    final List<LightweightTypeReference> result = Lists.newArrayList();
    final Multiset<JvmType> distances = HashMultiset.create(7);
    final Multiset<JvmType> counterPerType = HashMultiset.create(7);
    collectSuperTypes(new SuperTypeAcceptor() {

        int counter = 0;

        public boolean accept(LightweightTypeReference superType, int distance) {
            JvmType type = superType.getType();
            counterPerType.add(type, counter++);
            if (distances.contains(type)) {
                int currentCount = distances.count(type);
                if (currentCount < distance + 1) {
                    distances.setCount(type, distance + 1);
                } else {
                    return false;
                }
            } else {
                result.add(superType);
                distances.add(type, distance + 1);
            }
            return true;
        }

    });
    Collections.sort(result, new Comparator<LightweightTypeReference>() {
        public int compare(@Nullable LightweightTypeReference o1, @Nullable LightweightTypeReference o2) {
            if (o1 == null || o2 == null) {
                throw new IllegalArgumentException();
            }
            JvmType type1 = o1.getType();
            JvmType type2 = o2.getType();
            if (type1 == null)
                return 1;
            if (type2 == null)
                return -1;
            int distanceCompare = Ints.compare(distances.count(type1), distances.count(type2));
            if (distanceCompare != 0)
                return distanceCompare;
            return Ints.compare(counterPerType.count(type1), counterPerType.count(type2));
        }
    });
    return result;
}

From source file:org.eclipse.xtext.xbase.typesystem.references.LightweightTypeReference.java

/**
 * Returns the list of all super types which includes the super class and the 
 * implemented interfaces. The type parameters of the provided super types are resolved.
 * That means, the super types of <code>ArrayList&lt;String&gt;</code> includes
 * <code>List&lt;String&gt;</code> and <code>Collection&lt;String&gt;</code> 
 * rather than <code>Collection&lt;E&gt;</code>.
 *
 * @return the list of all super types, can be empty.
 */
public List<LightweightTypeReference> getAllSuperTypes() {
    final List<LightweightTypeReference> result = Lists.newArrayList();
    final Multiset<JvmType> distances = HashMultiset.create(7);
    final Multiset<JvmType> counterPerType = HashMultiset.create(7);
    collectSuperTypes(new SuperTypeAcceptor() {

        int counter = 0;

        @Override
        public boolean accept(LightweightTypeReference superType, int distance) {
            JvmType type = superType.getType();
            counterPerType.add(type, counter++);
            if (distances.contains(type)) {
                int currentCount = distances.count(type);
                if (currentCount < distance + 1) {
                    distances.setCount(type, distance + 1);
                } else {
                    return false;
                }
            } else {
                result.add(superType);
                distances.add(type, distance + 1);
            }
            return true;
        }

    });
    Collections.sort(result, new Comparator<LightweightTypeReference>() {
        @Override
        public int compare(/* @Nullable */ LightweightTypeReference o1,
                /* @Nullable */ LightweightTypeReference o2) {
            if (o1 == null || o2 == null) {
                throw new IllegalArgumentException();
            }
            JvmType type1 = o1.getType();
            JvmType type2 = o2.getType();
            if (type1 == null)
                return 1;
            if (type2 == null)
                return -1;
            int distanceCompare = Ints.compare(distances.count(type1), distances.count(type2));
            if (distanceCompare != 0)
                return distanceCompare;
            return Ints.compare(counterPerType.count(type1), counterPerType.count(type2));
        }
    });
    return result;
}
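
In both getAllSuperTypes variants the multiset doubles as a type-to-distance map: contains(type) distinguishes a first visit from a revisit, and the stored count is distance + 1 so that an absent element (count 0) can never be confused with distance zero. A reduced sketch of that bookkeeping, with illustrative type names:

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class DistanceBookkeepingSketch {

    // Records a visit at the given distance; returns false when the type was
    // already seen via an equally long or longer path, so the walk can be pruned.
    static boolean record(Multiset<String> distances, String type, int distance) {
        if (distances.contains(type)) {
            if (distances.count(type) < distance + 1) {
                distances.setCount(type, distance + 1); // the longer path wins
                return true;
            }
            return false;
        }
        distances.add(type, distance + 1); // first visit: store distance + 1
        return true;
    }

    public static void main(String[] args) {
        Multiset<String> distances = HashMultiset.create();
        System.out.println(record(distances, "Collection", 1)); // true  (first visit)
        System.out.println(record(distances, "Collection", 3)); // true  (longer path)
        System.out.println(record(distances, "Collection", 0)); // false (pruned)
    }
}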

From source file:org.icgc.dcc.submission.dictionary.DictionaryValidator.java

private void validateCodeLists(Set<DictionaryConstraintViolation> errors,
        Set<DictionaryConstraintViolation> warnings) {
    for (val codeListName : dictionary.getCodeListNames()) {
        val collection = codeListIndex.get(codeListName);
        int count = collection.size();
        if (count == 0) {
            warnings.add(new DictionaryConstraintViolation("Missing code list", codeListName));
            break;
        }
        if (count > 1) {
            errors.add(new DictionaryConstraintViolation("Duplicate code lists", collection));
        }

        val codeList = getFirst(collection, null);

        Multiset<String> codes = HashMultiset.create();
        Multiset<String> values = HashMultiset.create();
        for (val term : codeList.getTerms()) {
            codes.add(term.getCode());
            values.add(term.getValue());
        }

        for (val term : codeList.getTerms()) {
            val code = term.getCode();
            val value = term.getValue();

            if (codes.count(code) > 1) {
                errors.add(
                        new DictionaryConstraintViolation("Duplicate code list codes", term, code, codeList));
            }
            if (values.count(value) > 1) {
                errors.add(
                        new DictionaryConstraintViolation("Duplicate code list values", term, value, codeList));
            }
            if (codes.contains(value) && !code.equals(value)) {
                errors.add(new DictionaryConstraintViolation("Non-disjoint code list code and value", term,
                        value, codeList));
            }
        }
    }
}
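
The validator builds one multiset per column and flags every element whose count exceeds one; contains is only needed for the cross-column disjointness check at the end. A minimal version of the duplicate detection, with illustrative codes:

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

import java.util.Arrays;
import java.util.List;

public class DuplicateCheckSketch {
    public static void main(String[] args) {
        List<String> codes = Arrays.asList("A", "B", "A", "C");
        // the multiset counts every occurrence in one pass
        Multiset<String> seen = HashMultiset.create(codes);
        for (String code : codes) {
            if (seen.count(code) > 1) {
                // reported once per occurrence, mirroring the per-term loop above
                System.out.println("duplicate code: " + code);
            }
        }
    }
}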

From source file:fr.ens.transcriptome.aozan.collectors.UndeterminedIndexesProcessThreads.java

/**
 * Computes, for a sample, the number of clusters that can be recovered.
 * @param sampleName the sample name
 * @param indicesCounts multiset that contains the data to process
 * @param resultKeySuffix the suffix for the run data key entry
 * @return the number of clusters that can be recovered for the sample
 */
private int computeRecoverableSampleClusterCount(final String sampleName, final Multiset<String> indicesCounts,
        final String resultKeySuffix) {

    int recoverableClusterCount = 0;

    if (!this.isSkipProcessResult) {
        // Sum the number of clusters that can be recovered
        if (this.newSamplesIndexes.containsKey(sampleName)) {
            for (final String newIndex : this.newSamplesIndexes.get(sampleName)) {

                if (indicesCounts.contains(newIndex)) {
                    final int count = indicesCounts.count(newIndex);
                    recoverableClusterCount += count;
                }
            }
        }
    }

    // Set the result for the sample
    getResults().put("undeterminedindices" + ".lane" + this.lane + ".sample." + sampleName + resultKeySuffix,
            recoverableClusterCount);

    return recoverableClusterCount;
}
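
One detail worth noting: Multiset#count already returns 0 for an absent element, so the contains guard in the loop above is defensive rather than required. A tiny sketch of the shorter form, with illustrative indices:

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class CountWithoutContains {
    public static void main(String[] args) {
        Multiset<String> indicesCounts = HashMultiset.create();
        indicesCounts.add("ACGT", 3);

        int recoverableClusterCount = 0;
        // count(...) is 0 for an absent element, so no contains(...) check is needed
        recoverableClusterCount += indicesCounts.count("ACGT"); // + 3
        recoverableClusterCount += indicesCounts.count("TTTT"); // + 0, harmless
        System.out.println(recoverableClusterCount);            // 3
    }
}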

From source file:de.huberlin.german.korpling.laudatioteitool.SplitTEI.java

private TEIValidator.Errors extractPreparationSteps(Document doc)
        throws LaudatioException, IOException, SAXException {
    TEIValidator validator = preparationSchemeURL == null ? new TEIPreparationValidator()
            : new FromURLValidator(preparationSchemeURL);
    Multiset<String> knownPreparationTitles = HashMultiset.create();

    File documentDir = new File(outputDirectory, "PreparationHeader");
    if (!documentDir.exists() && !documentDir.mkdir()) {
        throw new LaudatioException(
                messages.getString("COULD NOT CREATE DIRECTORY") + documentDir.getAbsolutePath());
    }

    Preconditions.checkNotNull(doc.getRootElement().getChild("teiCorpus", null));
    Element preparationRoot = Preconditions
            .checkNotNull(doc.getRootElement().getChild("teiCorpus", null).getChild("teiCorpus", null));

    for (Element preparationHeader : preparationRoot.getChildren("teiHeader", null)) {
        Preconditions.checkState("PreparationHeader".equals(preparationHeader.getAttributeValue("type")));

        // create the subtree for the global corpus header
        Namespace teiNS = Namespace.getNamespace("http://www.tei-c.org/ns/1.0");
        Element tei = new Element("TEI", teiNS);
        tei.addContent(preparationHeader.clone());
        Document newDoc = new Document(tei);

        if (preparationSchemeURL == null) {
            newDoc.addContent(0, new ProcessingInstruction("xml-model",
                    "href=\"" + TEIPreparationValidator.DEFAULT_SCHEME_URL + "\""));
        } else {
            newDoc.addContent(0,
                    new ProcessingInstruction("xml-model", "href=\"" + preparationSchemeURL + "\""));
        }

        // we need to append an empty "text" element after the header
        Element text = new Element("text", teiNS);
        text.setText("");
        tei.addContent(text);

        Element fileDesc = Preconditions
                .checkNotNull(tei.getChild("teiHeader", null).getChild("fileDesc", null));

        String outName = UUID.randomUUID().toString();

        Element titleStmt = Preconditions.checkNotNull(fileDesc.getChild("titleStmt", null));
        Element title = Preconditions.checkNotNull(titleStmt.getChild("title", null));
        String corresp = title.getAttributeValue("corresp");
        if (corresp != null) {
            if (knownPreparationTitles.contains(corresp)) {
                knownPreparationTitles.add(corresp);
                outName = corresp + "_" + knownPreparationTitles.count(corresp);
                log.warn(messages.getString("MORE THAN ONE PREPARATION HEADER"), corresp);
            } else {
                outName = corresp;
                knownPreparationTitles.add(corresp);
            }
        }

        File outputFile = new File(documentDir, outName + ".xml");
        XMLOutputter xmlOut = new XMLOutputter(Format.getPrettyFormat());
        xmlOut.output(newDoc, new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8"));
        log.info(messages.getString("WRITTEN PREPARATION HEADER"), outputFile.getPath());

        validator.validate(outputFile);

    }
    return validator.getErrors();
}
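
The multiset here doubles as a name disambiguator: the first occurrence of a title is used verbatim, and each later occurrence is suffixed with its running count. A compact sketch of that naming scheme, with illustrative titles:

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class UniqueNameSketch {
    public static void main(String[] args) {
        Multiset<String> knownTitles = HashMultiset.create();
        for (String title : new String[] { "intro", "intro", "methods", "intro" }) {
            String outName;
            if (knownTitles.contains(title)) {
                knownTitles.add(title);
                outName = title + "_" + knownTitles.count(title); // e.g. intro_2
            } else {
                knownTitles.add(title);
                outName = title; // first occurrence keeps the plain title
            }
            System.out.println(outName);
        }
        // prints: intro, intro_2, methods, intro_3
    }
}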

From source file:it.cnr.isti.hpc.dexter.disambiguation.TurkishEntityDisambiguator.java

@Override
public EntityMatchList disambiguate(DexterLocalParams localParams, SpotMatchList sml) {
    entityScoreMap = new HashMap<String, EntityScores>();
    selectedEntities = new HashSet<String>();
    Multiset<String> entityFrequencyMultiset = HashMultiset.create();

    EntityMatchList entities = sml.getEntities();
    String inputText = localParams.getParams().get("text");
    String algorithm = Property.getInstance().get("algorithm");

    String ambigious = Property.getInstance().get("algorithm.ambigious");

    List<Token> inputTokens = Zemberek.getInstance().disambiguateFindTokens(inputText, false, true);
    List<Double> documentVector = DescriptionEmbeddingAverage.getAverageVectorList(inputText);
    Multiset<String> inputTokensMultiset = HashMultiset.create();
    for (Token token : inputTokens) {
        inputTokensMultiset.add(token.getMorphText());
    }

    Multiset<String> domainMultiset = HashMultiset.create();
    Multiset<String> typeMultiset = HashMultiset.create();
    HashMap<String, Double> entitySimMap = new HashMap<String, Double>();
    // if (printCandidateEntities) {
    // printEntities(entities);
    // }
    HashSet<String> words = new HashSet<String>();
    Multiset<String> leskWords = HashMultiset.create();

    // first pass for finding number of types and domains
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        String id = em.getId();
        if (!entityFrequencyMultiset.contains(id)) {
            entityFrequencyMultiset.add(id);
            Entity entity = em.getEntity();
            words.add(entity.getShingle().getText());
            String type = entity.getPage().getType();
            if (type != null && type.length() > 0) {
                typeMultiset.add(type);
            }
            String domain = entity.getPage().getDomain();
            if (domain != null && domain.length() > 0) {
                domainMultiset.add(domain);
            }

            String desc = entity.getPage().getDescription();
            List<Token> tokens = Zemberek.getInstance().disambiguateFindTokens(desc, false, true);
            for (Token token : tokens) {
                leskWords.add(token.getMorphText());
            }

        } else {
            entityFrequencyMultiset.add(id);
        }
    }

    int maxDomainCount = 0;
    for (String domain : Multisets.copyHighestCountFirst(domainMultiset).elementSet()) {
        maxDomainCount = domainMultiset.count(domain);
        break;
    }
    int maxTypeCount = 0;
    for (String type : Multisets.copyHighestCountFirst(typeMultiset).elementSet()) {
        maxTypeCount = typeMultiset.count(type);
        break;
    }

    double maxSuffixScore = 0, maxLeskScore = 0, maxSimpleLeskScore = 0, maxLinkScore = 0,
            maxHashInfoboxScore = 0, maxwordvecDescriptionLocalScore = 0, maxHashDescriptionScore = 0,
            maxPopularityScore = 0, maxWordvectorAverage = 0, maxWordvecLinksScore = 0;
    // second pass compute similarities between entities in a window
    int currentSpotIndex = -1;
    SpotMatch currentSpot = null;
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        SpotMatch spot = em.getSpot();
        if (currentSpot == null || spot != currentSpot) {
            currentSpotIndex++;
            currentSpot = spot;
        }

        String id = em.getId();
        Entity entity = entities.get(i).getEntity();
        EntityPage page = entities.get(i).getEntity().getPage();
        String domain = page.getDomain();
        String type = page.getType();
        Shingle shingle = entity.getShingle();

        /* windowing algorithm starts */
        int left = currentSpotIndex - window;
        int right = currentSpotIndex + window;
        if (left < 0) {
            right -= left;
            left = 0;
        }
        if (right > sml.size()) {
            left += (sml.size()) - right;
            right = sml.size();
            if (left < 0) {
                left = 0;
            }
        }

        double linkScore = 0, hashInfoboxScore = 0, wordvecDescriptionLocalScore = 0, hashDescriptionScore = 0,
                wordvecLinksScore = 0;
        for (int j = left; j < right; j++) {
            SpotMatch sm2 = sml.get(j);
            EntityMatchList entities2 = sm2.getEntities();
            for (EntityMatch em2 : entities2) {
                String id2 = em2.getId();
                EntityPage page2 = em2.getEntity().getPage();
                int counter = 0;
                if (!ambigious.equals("true")) {
                    for (EntityMatch entityMatch : entities2) {
                        if (entityMatch.getId().startsWith("w")) {
                            counter++;
                        }
                    }
                }

                if ((ambigious.equals("true") || counter == 1) && em.getSpot() != em2.getSpot()
                        && !id.equals(id2)) {
                    // Link Similarity calculation starts
                    double linkSim = 0;
                    if (id.startsWith("w") && id2.startsWith("w")) {
                        if (entitySimMap.containsKey("link" + id + id2)) {
                            linkSim = entitySimMap.get("link" + id + id2);
                        } else {
                            HashSet<String> set1 = Sets.newHashSet(page.getLinks().split(" "));
                            HashSet<String> set2 = Sets.newHashSet(page2.getLinks().split(" "));
                            linkSim = JaccardCalculator.calculateSimilarity(set1, set2);
                            entitySimMap.put("link" + id + id2, linkSim);
                        }
                        linkScore += linkSim;
                        // Link Similarity calculation ends
                    }
                    // Entity embedding similarity calculation starts
                    double eeSim = 0;
                    if (id.startsWith("w") && id2.startsWith("w")) {
                        if (entitySimMap.containsKey("ee" + id + id2)) {
                            eeSim = entitySimMap.get("ee" + id + id2);
                        } else {
                            eeSim = EntityEmbeddingSimilarity.getInstance().getSimilarity(page, page2);
                            entitySimMap.put("ee" + id + id2, eeSim);
                        }
                        hashInfoboxScore += eeSim;
                    }
                    double w2veclinksSim = 0;
                    if (id.startsWith("w") && id2.startsWith("w")) {
                        if (entitySimMap.containsKey("wl" + id + id2)) {
                            w2veclinksSim = entitySimMap.get("wl" + id + id2);
                        } else {
                            w2veclinksSim = AveragePooling.getInstance().getSimilarity(page.getWord2vec(),
                                    page2.getWord2vec());
                            entitySimMap.put("wl" + id + id2, w2veclinksSim);
                        }
                        wordvecLinksScore += w2veclinksSim;
                    }

                    // Entity embedding similarity calculation ends

                    // Description word2vec similarity calculation
                    // starts
                    double word2vecSim = 0;

                    if (entitySimMap.containsKey("w2v" + id + id2)) {
                        word2vecSim = entitySimMap.get("w2v" + id + id2);
                    } else {
                        word2vecSim = AveragePooling.getInstance().getSimilarity(page2.getDword2vec(),
                                page.getDword2vec());
                        entitySimMap.put("w2v" + id + id2, word2vecSim);
                    }
                    wordvecDescriptionLocalScore += word2vecSim;
                    // Description word2vec similarity calculation ends

                    // Description autoencoder similarity calculation
                    // starts
                    double autoVecSim = 0;

                    if (entitySimMap.containsKey("a2v" + id + id2)) {
                        autoVecSim = entitySimMap.get("a2v" + id + id2);
                    } else {
                        autoVecSim = AveragePooling.getInstance().getSimilarity(page2.getDautoencoder(),
                                page.getDautoencoder());
                        entitySimMap.put("a2v" + id + id2, autoVecSim);
                    }
                    hashDescriptionScore += autoVecSim;
                    // Description autoencoder similarity calculation
                    // ends

                }
            }
        }
        if (linkScore > maxLinkScore) {
            maxLinkScore = linkScore;
        }
        if (hashInfoboxScore > maxHashInfoboxScore) {
            maxHashInfoboxScore = hashInfoboxScore;
        }
        if (wordvecDescriptionLocalScore > maxwordvecDescriptionLocalScore) {
            maxwordvecDescriptionLocalScore = wordvecDescriptionLocalScore;
        }
        if (hashDescriptionScore > maxHashDescriptionScore) {
            maxHashDescriptionScore = hashDescriptionScore;
        }
        if (wordvecLinksScore > maxWordvecLinksScore) {
            maxWordvecLinksScore = wordvecLinksScore;
        }

        /* windowing algorithm ends */

        double domainScore = 0;
        if (domainMultiset.size() > 0 && maxDomainCount > 1 && domainMultiset.count(domain) > 1) {
            domainScore = (double) domainMultiset.count(domain) / maxDomainCount;
        }
        double typeScore = 0;
        if (typeMultiset.size() > 0 && maxTypeCount > 1 && typeMultiset.count(type) > 1) {
            typeScore = (double) typeMultiset.count(type) / maxTypeCount;
        }
        if (typeBlackList.contains(type)) {
            typeScore /= 10;
        }

        double typeContentScore = 0;
        if (type.length() > 0 && StringUtils.containsIgnoreCase(words.toString(), type)) {
            typeContentScore = 1;
        }

        double typeClassifierScore = TypeClassifier.getInstance().predict(page, page.getTitle(), page.getType(),
                entity.getShingle().getSentence());

        double wordvecDescriptionScore = AveragePooling.getInstance().getSimilarity(documentVector,
                page.getDword2vec());
        if (wordvecDescriptionScore > maxWordvectorAverage) {
            maxWordvectorAverage = wordvecDescriptionScore;
        }

        double suffixScore = 0;

        if (type != null && type.length() > 0) {
            Set<String> suffixes = new HashSet<String>();
            String t = entity.getTitle().toLowerCase(new Locale("tr", "TR"));

            for (int x = 0; x < entities.size(); x++) {
                EntityMatch e2 = entities.get(x);
                if (e2.getId().equals(entity.getId())) {
                    suffixes.add(e2.getMention());
                }
            }
            suffixes.remove(t);
            suffixes.remove(entity.getTitle());
            // String inputTextLower = inputText.toLowerCase(new
            // Locale("tr",
            // "TR"));
            // while (inputTextLower.contains(t)) {
            // int start = inputTextLower.indexOf(t);
            // int end = inputTextLower.indexOf(" ", start + t.length());
            // if (end > start) {
            // String suffix = inputTextLower.substring(start, end);
            // // .replaceAll("\\W", "");
            // if (suffix.contains("'")
            // || (Zemberek.getInstance().hasMorph(suffix)
            // && !suffix.equals(t) && suffix.length() > 4)) {
            // suffixes.add(suffix);
            // }
            // inputTextLower = inputTextLower.substring(end);
            // } else {
            // break;
            // }
            // }
            if (suffixes.size() >= minSuffix) {
                for (String suffix : suffixes) {
                    double sim = gd.calculateSimilarity(suffix, type);
                    suffixScore += sim;
                }
            }
        }

        // String entitySuffix = page.getSuffix();
        // String[] inputSuffix = shingle.getSuffix().split(" ");
        // for (int j = 0; j < inputSuffix.length; j++) {
        // if (entitySuffix.contains(inputSuffix[j])) {
        // suffixScore += 0.25f;
        // }
        // }

        if (suffixScore > maxSuffixScore) {
            maxSuffixScore = suffixScore;
        }
        // if (id.equals("w691538")) {
        // LOGGER.info("");
        // }
        double letterCaseScore = 0;
        int lc = page.getLetterCase();
        if (StringUtils.isAllLowerCase(em.getMention()) && lc == 0 && id.startsWith("t")) {
            letterCaseScore = 1;
        } else if (StringUtils.isAllUpperCase(em.getMention()) && lc == 1 && id.startsWith("w")) {
            letterCaseScore = 1;
        } else if (Character.isUpperCase(em.getMention().charAt(0)) && lc == 2 && id.startsWith("w")) {
            letterCaseScore = 1;
        } else if (StringUtils.isAllLowerCase(em.getMention()) && id.startsWith("t")) {
            letterCaseScore = 1;
        }

        double nameScore = 1 - LevenshteinDistanceCalculator.calculateDistance(page.getTitle(),
                Zemberek.removeAfterSpostrophe(em.getMention()));

        double popularityScore = page.getRank();
        if (id.startsWith("w")) {
            popularityScore = Math.log10(popularityScore + 1);
            if (popularityScore > maxPopularityScore) {
                maxPopularityScore = popularityScore;
            }
        }

        double leskScore = 0, simpleLeskScore = 0;

        String desc = em.getEntity().getPage().getDescription();
        if (desc != null) {
            List<Token> tokens = Zemberek.getInstance().disambiguateFindTokens(desc, false, true);
            for (Token token : tokens) {
                if (inputTokensMultiset.contains(token.getMorphText())
                        && !TurkishNLP.isStopWord(token.getMorphText())) {
                    simpleLeskScore += inputTokensMultiset.count(token.getMorphText());
                }
                if (leskWords.contains(token.getMorphText()) && !TurkishNLP.isStopWord(token.getMorphText())) {
                    leskScore += leskWords.count(token.getMorphText());
                }

            }
            leskScore /= Math.log(tokens.size() + 1);
            simpleLeskScore /= Math.log(tokens.size() + 1);
            if (leskScore > maxLeskScore) {
                maxLeskScore = leskScore;
            }
            if (simpleLeskScore > maxSimpleLeskScore) {
                maxSimpleLeskScore = simpleLeskScore;
            }

            if (!entityScoreMap.containsKey(id)) {
                EntityScores scores = new EntityScores(em, id, popularityScore, nameScore, letterCaseScore,
                        suffixScore, wordvecDescriptionScore, typeContentScore, typeScore, domainScore,
                        hashDescriptionScore, wordvecDescriptionLocalScore, hashInfoboxScore, linkScore,
                        wordvecLinksScore, leskScore, simpleLeskScore, typeClassifierScore);
                entityScoreMap.put(id, scores);
            } else {
                EntityScores entityScores = entityScoreMap.get(id);
                entityScores.setHashInfoboxScore((entityScores.getHashInfoboxScore() + hashInfoboxScore) / 2);
                entityScores.setHashDescriptionScore(
                        (entityScores.getHashInfoboxScore() + hashDescriptionScore) / 2);
                entityScores.setLinkScore((entityScores.getLinkScore() + linkScore) / 2);
                entityScores.setWordvecDescriptionLocalScore(
                        (entityScores.getWordvecDescriptionLocalScore() + wordvecDescriptionLocalScore) / 2);
                entityScores
                        .setWordvecLinksScore((entityScores.getWordvecLinksScore() + wordvecLinksScore) / 2);
                entityScores.setLeskScore((entityScores.getLeskScore() + leskScore) / 2);

            }

        }
    }
    /* normalization and total score calculation starts */
    Set<String> set = new HashSet<String>();
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        String id = em.getId();
        EntityScores entityScores = entityScoreMap.get(id);
        if (set.contains(id)) {
            continue;
        }
        if (id.startsWith("w")) {
            if (maxLinkScore > 0 && entityScores.getLinkScore() > 0) {
                entityScores.setLinkScore(entityScores.getLinkScore() / maxLinkScore);
            }
            if (maxHashInfoboxScore > 0 && entityScores.getHashInfoboxScore() > 0) {
                entityScores.setHashInfoboxScore(entityScores.getHashInfoboxScore() / maxHashInfoboxScore);
            }
            if (maxWordvecLinksScore > 0 && entityScores.getWordvecLinksScore() > 0) {
                entityScores.setWordvecLinksScore(entityScores.getWordvecLinksScore() / maxWordvecLinksScore);
            }
            if (maxPopularityScore > 0 && entityScores.getPopularityScore() > 0) {
                entityScores.setPopularityScore(entityScores.getPopularityScore() / maxPopularityScore);
            }
        }
        if (maxwordvecDescriptionLocalScore > 0 && entityScores.getWordvecDescriptionLocalScore() > 0) {
            entityScores.setWordvecDescriptionLocalScore(
                    entityScores.getWordvecDescriptionLocalScore() / maxwordvecDescriptionLocalScore);
        }
        if (maxHashDescriptionScore > 0 && entityScores.getHashDescriptionScore() > 0) {
            entityScores
                    .setHashDescriptionScore(entityScores.getHashDescriptionScore() / maxHashDescriptionScore);
        }
        if (maxWordvectorAverage > 0 && entityScores.getWordvecDescriptionScore() > 0) {
            entityScores.setWordvecDescriptionScore(
                    entityScores.getWordvecDescriptionScore() / maxWordvectorAverage);
        }
        if (maxLeskScore > 0 && entityScores.getLeskScore() > 0) {
            entityScores.setLeskScore(entityScores.getLeskScore() / maxLeskScore);
        }
        if (maxSimpleLeskScore > 0 && entityScores.getSimpleLeskScore() > 0) {
            entityScores.setSimpleLeskScore(entityScores.getSimpleLeskScore() / maxSimpleLeskScore);
        }
        if (maxSuffixScore > 0 && entityScores.getSuffixScore() > 0) {
            entityScores.setSuffixScore(entityScores.getSuffixScore() / maxSuffixScore);
        }
        set.add(id);
    }

    LOGGER.info("\t"
            + "id\tTitle\tURL\tScore\tPopularity\tName\tLesk\tSimpeLesk\tCase\tNoun\tSuffix\tTypeContent\tType\tDomain\twordvecDescription\twordvecDescriptionLocal\thashDescription\thashInfobox\tword2vecLinks\tLink\t\ttypeClassifier\tDescription");
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        String id = em.getId();
        EntityScores e = entityScoreMap.get(id);
        double wikiScore = 0;
        if (id.startsWith("w") && Character.isUpperCase(em.getMention().charAt(0))) {
            wikiScore = wikiWeight;
        } else if (id.startsWith("t") && Character.isLowerCase(em.getMention().charAt(0))) {
            wikiScore = wikiWeight;
        }
        // if(id.equals("w508792")){
        // LOGGER.info("");
        // }
        double totalScore = wikiScore + e.getPopularityScore() * popularityWeight
                + e.getNameScore() * nameWeight + e.getLeskScore() * leskWeight
                + e.getSimpleLeskScore() * simpleLeskWeight + e.getLetterCaseScore() * letterCaseWeight
                + e.getSuffixScore() * suffixWeight + e.getTypeContentScore() * typeContentWeight
                + e.getTypeScore() * typeWeight + e.getDomainScore() * domainWeight
                + e.getWordvecDescriptionScore() * wordvecDescriptionWeight
                + e.getWordvecDescriptionLocalScore() * wordvecDescriptionLocalWeight
                + e.getHashDescriptionScore() * hashDescriptionWeight
                + e.getHashInfoboxScore() * hashInfoboxWeight + e.getWordvecLinksScore() * word2vecLinksWeight
                + e.getLinkScore() * linkWeight + e.getTypeClassifierkScore() * typeClassifierkWeight;
        if (ranklib == true) {
            totalScore = RankLib.getInstance().score(e);
        }

        if (em.getEntity().getPage().getUrlTitle().contains("(")) {
            totalScore /= 2;
        }
        em.setScore(totalScore);
        e.setScore(totalScore);

        LOGGER.info("\t" + id + "\t" + em.getEntity().getPage().getTitle() + "\t"
                + em.getEntity().getPage().getUrlTitle() + "\t" + em.getScore() + "\t"
                + e.getPopularityScore() * popularityWeight + "\t" + e.getNameScore() * nameWeight + "\t"
                + e.getLeskScore() * leskWeight + "\t" + e.getSimpleLeskScore() * simpleLeskWeight + "\t"
                + e.getLetterCaseScore() * letterCaseWeight + "\t" + e.getSuffixScore() * suffixWeight + "\t"
                + e.getTypeContentScore() * typeContentWeight + "\t" + e.getTypeScore() * typeWeight + "\t"
                + e.getDomainScore() * domainWeight + "\t"
                + e.getWordvecDescriptionScore() * wordvecDescriptionWeight + "\t"
                + e.getWordvecDescriptionLocalScore() * wordvecDescriptionLocalWeight + "\t"
                + e.getHashDescriptionScore() * hashDescriptionWeight + "\t"
                + e.getHashInfoboxScore() * hashInfoboxWeight + "\t"
                + e.getWordvecLinksScore() * word2vecLinksWeight + "\t" + e.getLinkScore() * linkWeight + "\t"
                + e.getTypeClassifierkScore() * typeClassifierkWeight + "\t"
                + em.getEntity().getPage().getDescription());
    }

    // if (annotateEntities) {
    // annotateEntities(localParams.getParams().get("originalText"), sml);
    // }

    EntityMatchList eml = new EntityMatchList();
    for (SpotMatch match : sml) {
        EntityMatchList list = match.getEntities();
        if (!list.isEmpty()) {
            list.sort();
            eml.add(list.get(0));
            selectedEntities.add(list.get(0).getId());
        }
    }
    return eml;
}
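
Within this method, entityFrequencyMultiset plays two roles at once: contains(id) detects the first sighting of an entity, so the expensive per-entity work in the first pass runs only once, while the add in both branches keeps the frequency for later use. A reduced sketch of that first-sighting idiom, with illustrative ids:

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class FirstSightingSketch {
    public static void main(String[] args) {
        Multiset<String> frequency = HashMultiset.create();
        for (String id : new String[] { "w1", "w2", "w1" }) {
            if (!frequency.contains(id)) {
                // first sighting: do the expensive one-time work here
                System.out.println("first sighting of " + id);
            }
            frequency.add(id); // counted on every sighting
        }
        System.out.println(frequency.count("w1")); // 2
    }
}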

From source file:org.opencb.opencga.storage.mongodb.variant.VariantMongoDBAdaptor.java

/**
 * Two-step insertion:
 * First check that the variant and study exist by making an update.
 * For those that don't exist, push a study with the file and genotype information.
 * <p>
 * The documents that throw a "dup key" exception are those variants that already exist and have the study.
 * Then, only for those variants, make a second update.
 * <p>
 * An interesting idea would be to invert these actions depending on the number of already inserted variants.
 *
 * @param data Variants to insert
 * @param fileId File ID
 * @param variantConverter Variant converter to be used
 * @param variantSourceEntryConverter Variant source converter to be used
 * @param studyConfiguration Configuration for the study
 * @param loadedSampleIds Other loaded sampleIds EXCEPT those that are going to be loaded
 * @return QueryResult object
 */
QueryResult<MongoDBVariantWriteResult> insert(List<Variant> data, int fileId,
        DocumentToVariantConverter variantConverter,
        DocumentToStudyVariantEntryConverter variantSourceEntryConverter, StudyConfiguration studyConfiguration,
        List<Integer> loadedSampleIds) {

    MongoDBVariantWriteResult writeResult = new MongoDBVariantWriteResult();
    long startTime = System.currentTimeMillis();
    if (data.isEmpty()) {
        return new QueryResult<>("insertVariants", 0, 1, 1, "", "", Collections.singletonList(writeResult));
    }
    List<Bson> queries = new ArrayList<>(data.size());
    List<Bson> updates = new ArrayList<>(data.size());
    // Use a multiset instead of a normal set, to keep tracking of duplicated variants
    Multiset<String> nonInsertedVariants = HashMultiset.create();
    String fileIdStr = Integer.toString(fileId);

    //        List<String> extraFields = studyConfiguration.getAttributes().getAsStringList(VariantStorageManager.Options.EXTRA_GENOTYPE_FIELDS
    //                .key());
    boolean excludeGenotypes = studyConfiguration.getAttributes().getBoolean(
            VariantStorageManager.Options.EXCLUDE_GENOTYPES.key(),
            VariantStorageManager.Options.EXCLUDE_GENOTYPES.defaultValue());

    long nanoTime = System.nanoTime();
    Map missingSamples = Collections.emptyMap();
    String defaultGenotype = studyConfiguration.getAttributes().getString(DEFAULT_GENOTYPE.key(), "");
    if (defaultGenotype.equals(DocumentToSamplesConverter.UNKNOWN_GENOTYPE)) {
        logger.debug("Do not need fill gaps. DefaultGenotype is UNKNOWN_GENOTYPE({}).",
                DocumentToSamplesConverter.UNKNOWN_GENOTYPE);
    } else if (excludeGenotypes) {
        logger.debug("Do not need fill gaps. Excluding genotypes.");
    } else if (!loadedSampleIds.isEmpty()) {
        missingSamples = new Document(DocumentToSamplesConverter.UNKNOWN_GENOTYPE, loadedSampleIds); // ?/?
    }
    //            List<Object> missingOtherValues = new ArrayList<>(loadedSampleIds.size());
    //            for (int i = 0; i < loadedSampleIds.size(); i++) {
    //                missingOtherValues.add(DBObjectToSamplesConverter.UNKNOWN_FIELD);
    //            }
    for (Variant variant : data) {
        if (variant.getType().equals(VariantType.NO_VARIATION)) {
            //Storage-MongoDB is not able to store NON VARIANTS
            writeResult.setSkippedVariants(writeResult.getSkippedVariants() + 1);
            continue;
        } else if (variant.getType().equals(VariantType.SYMBOLIC)) {
            logger.warn("Skip symbolic variant " + variant.toString());
            writeResult.setSkippedVariants(writeResult.getSkippedVariants() + 1);
            continue;
        }
        String id = variantConverter.buildStorageId(variant);
        for (StudyEntry studyEntry : variant.getStudies()) {
            if (studyEntry.getFiles().size() == 0
                    || !studyEntry.getFiles().get(0).getFileId().equals(fileIdStr)) {
                continue;
            }
            int studyId = studyConfiguration.getStudyId();
            Document study = variantSourceEntryConverter.convertToStorageType(studyEntry);
            Document genotypes = study.get(DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD,
                    Document.class);
            if (genotypes != null) { //If genotypes is null, genotypes are not supposed to be loaded
                genotypes.putAll(missingSamples); //Add missing samples
                //                        for (String extraField : extraFields) {
                //                            List<Object> otherFieldValues = (List<Object>) study.get(extraField.toLowerCase());
                //                            otherFieldValues.addAll(0, missingOtherValues);
                //                        }
            }
            Document push = new Document(DocumentToVariantConverter.STUDIES_FIELD, study);
            Document update = new Document().append("$push", push).append("$setOnInsert",
                    variantConverter.convertToStorageType(variant));
            if (variant.getIds() != null && !variant.getIds().isEmpty()
                    && !variant.getIds().iterator().next().isEmpty()) {
                update.put("$addToSet", new Document(DocumentToVariantConverter.IDS_FIELD,
                        new Document("$each", variant.getIds())));
            }
            // { _id: <variant_id>, "studies.sid": {$ne: <studyId> } }
            //If the variant exists and contains the study, this find will fail, will try to do the upsert, and throw a
            // duplicated key exception.
            queries.add(new Document("_id", id).append(
                    DocumentToVariantConverter.STUDIES_FIELD + "."
                            + DocumentToStudyVariantEntryConverter.STUDYID_FIELD,
                    new Document("$ne", studyId)));
            updates.add(update);
        }
    }

    //
    if (!queries.isEmpty()) {
        QueryOptions options = new QueryOptions(UPSERT, true);
        options.put(MULTI, false);
        int newDocuments;
        int updatedObjects;

        try {
            BulkWriteResult bulkWriteResult;
            bulkWriteResult = variantsCollection.update(queries, updates, options).first();
            newDocuments = bulkWriteResult.getUpserts().size();
            updatedObjects = bulkWriteResult.getModifiedCount();
        } catch (MongoBulkWriteException e) {
            BulkWriteResult bulkWriteResult;
            bulkWriteResult = e.getWriteResult();
            newDocuments = bulkWriteResult.getUpserts().size();
            updatedObjects = bulkWriteResult.getModifiedCount();
            for (BulkWriteError writeError : e.getWriteErrors()) {
                if (writeError.getCode() == 11000) { //Dup Key error code
                    Matcher matcher = writeResultErrorPattern.matcher(writeError.getMessage());
                    if (matcher.find()) {
                        String id = matcher.group(1);
                        nonInsertedVariants.add(id);
                    } else {
                        throw e;
                    }
                } else {
                    throw e;
                }
            }
        }

        writeResult.setNewVariants(newDocuments);
        writeResult.setUpdatedVariants(updatedObjects);
        //                writeResult.setNewDocuments(data.size() - nonInsertedVariants.size() - writeResult.getSkippedVariants());
        queries.clear();
        updates.clear();
    }
    writeResult.setNewVariantsNanoTime(System.nanoTime() - nanoTime);
    nanoTime = System.nanoTime();

    for (Variant variant : data) {
        variant.setAnnotation(null);
        String id = variantConverter.buildStorageId(variant);

        if (nonInsertedVariants != null && !nonInsertedVariants.contains(id)) {
            continue; //Already inserted variant
        }

        for (StudyEntry studyEntry : variant.getStudies()) {
            if (studyEntry.getFiles().size() == 0
                    || !studyEntry.getFiles().get(0).getFileId().equals(fileIdStr)) {
                continue;
            }

            Document studyObject = variantSourceEntryConverter.convertToStorageType(studyEntry);
            Document genotypes = studyObject.get(DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD,
                    Document.class);
            Document push = new Document();

            if (!excludeGenotypes) {
                if (genotypes != null) { //If genotypes is null, genotypes are not supposed to be loaded
                    for (String genotype : genotypes.keySet()) {
                        push.put(
                                DocumentToVariantConverter.STUDIES_FIELD + ".$."
                                        + DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD + "." + genotype,
                                new Document("$each", genotypes.get(genotype)));
                    }
                    //                    for (String extraField : extraFields) {
                    //                        List values = (List) studyObject.get(extraField.toLowerCase());
                    //                        push.put(DBObjectToVariantConverter.STUDIES_FIELD + ".$." + extraField.toLowerCase(),
                    //                                new Document("$each", values).append("$position", loadedSampleIds.size()));
                    //                    }
                } else {
                    push.put(
                            DocumentToVariantConverter.STUDIES_FIELD + ".$."
                                    + DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD,
                            Collections.emptyMap());
                }
            }
            push.put(
                    DocumentToVariantConverter.STUDIES_FIELD + ".$."
                            + DocumentToStudyVariantEntryConverter.FILES_FIELD,
                    ((List) studyObject.get(DocumentToStudyVariantEntryConverter.FILES_FIELD)).get(0));
            Document update = new Document(new Document("$push", push));

            queries.add(new Document("_id", id)
                    .append(DocumentToVariantConverter.STUDIES_FIELD + '.'
                            + DocumentToStudyVariantEntryConverter.STUDYID_FIELD,
                            studyConfiguration.getStudyId())
                    .append(DocumentToVariantConverter.STUDIES_FIELD + '.'
                            + DocumentToStudyVariantEntryConverter.FILES_FIELD + '.'
                            + DocumentToStudyVariantEntryConverter.FILEID_FIELD, new Document("$ne", fileId)));
            updates.add(update);

        }
    }
    writeResult.setExistingVariantsNanoTime(System.nanoTime() - nanoTime);

    if (!queries.isEmpty()) {
        QueryOptions options = new QueryOptions(UPSERT, false);
        options.put(MULTI, false);
        QueryResult<BulkWriteResult> update = variantsCollection.update(queries, updates, options);
        // Can happen that nonInsertedVariantsNum != queries.size() != nonInsertedVariants.size() if there was
        // a duplicated variant.
        writeResult.setNonInsertedVariants(nonInsertedVariants.size() - update.first().getMatchedCount());
        writeResult.setUpdatedVariants(writeResult.getUpdatedVariants() + update.first().getModifiedCount());
    }

    return new QueryResult<>("insertVariants", ((int) (System.currentTimeMillis() - startTime)), 1, 1, "", "",
            Collections.singletonList(writeResult));
}
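
Using a multiset rather than a set matters in the final accounting above: the same variant id can fail the first-phase upsert more than once, and nonInsertedVariants.size() deliberately counts those repeats. A reduced sketch of that bookkeeping, with illustrative ids and counts:

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class DupKeyAccountingSketch {
    public static void main(String[] args) {
        // ids that failed the first-phase upsert with a duplicate-key error
        Multiset<String> nonInsertedVariants = HashMultiset.create();
        nonInsertedVariants.add("var1");
        nonInsertedVariants.add("var1"); // the same variant can fail twice
        nonInsertedVariants.add("var2");

        int matchedBySecondUpdate = 2; // e.g. reported by the bulk update result
        // size() counts occurrences, so repeated failures are not lost
        System.out.println(nonInsertedVariants.size() - matchedBySecondUpdate); // 1
        System.out.println(nonInsertedVariants.contains("var1"));               // true
    }
}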