Usage examples for com.google.common.collect.Multiset#contains(Object), collected from open-source projects
@Override
boolean contains(@Nullable Object element);
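Before the project examples below, here is a minimal, self-contained sketch of the method's behavior (the class name and sample values are illustrative, not taken from the projects that follow). Note that contains(Object) keeps the plain Collection semantics on a Multiset: it reports whether the element occurs at least once, regardless of its multiplicity; use count(Object) to read the multiplicity itself.

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class MultisetContainsDemo {
    public static void main(String[] args) {
        Multiset<String> words = HashMultiset.create();
        words.add("apple");
        words.add("apple");
        words.add("banana");

        // contains(x) is true if x occurs at least once; equivalent to count(x) > 0
        System.out.println(words.contains("apple"));  // true
        System.out.println(words.contains("cherry")); // false

        // count(x) returns the number of occurrences (0 if absent)
        System.out.println(words.count("apple"));     // 2
        System.out.println(words.count("cherry"));    // 0
    }
}

Several of the examples below rely on exactly this contains/count pairing: contains as a cheap membership guard, count to read or compare multiplicities.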
From source file:org.xpect.expectation.impl.ExpectationCollection.java
protected boolean matchesUnordered(ActualCollection actual) {
    if (isPure())
        return items.equals(actual.items);
    Multiset<ActualItem> act = HashMultiset.create(actual);
    for (ExpectationItem item : this) {
        if (item.isWildcard())
            continue;
        if (item.isNegated()) {
            if (act.contains(item))
                return false;
        } else if (!act.contains(item))
            return false;
    }
    return true;
}
From source file:org.summer.dsl.model.types.util.TypeConformanceComputer.java
/**
 * Keeps the cumulated distance for all the common raw super types of the given references.
 * Interfaces that are more directly implemented will get a lower total count than more general
 * interfaces.
 */
protected void cumulateDistance(final List<JvmTypeReference> references,
        Multimap<JvmType, JvmTypeReference> all, Multiset<JvmType> cumulatedDistance) {
    for (JvmTypeReference other : references) {
        Multiset<JvmType> otherDistance = LinkedHashMultiset.create();
        initializeDistance(other, all, otherDistance);
        cumulatedDistance.retainAll(otherDistance);
        for (Multiset.Entry<JvmType> typeToDistance : otherDistance.entrySet()) {
            if (cumulatedDistance.contains(typeToDistance.getElement()))
                cumulatedDistance.add(typeToDistance.getElement(), typeToDistance.getCount());
        }
    }
}
From source file:org.summer.dsl.xbase.typesystem.conformance.TypeConformanceComputer.java
/**
 * Keeps the cumulated distance for all the common raw super types of the given references.
 * Interfaces that are more directly implemented will get a lower total count than more general
 * interfaces.
 */
protected void cumulateDistance(final List<LightweightTypeReference> references,
        Multimap<JvmType, LightweightTypeReference> all, Multiset<JvmType> cumulatedDistance) {
    for (LightweightTypeReference other : references) {
        Multiset<JvmType> otherDistance = LinkedHashMultiset.create();
        initializeDistance(other, all, otherDistance);
        cumulatedDistance.retainAll(otherDistance);
        for (Multiset.Entry<JvmType> typeToDistance : otherDistance.entrySet()) {
            if (cumulatedDistance.contains(typeToDistance.getElement()))
                cumulatedDistance.add(typeToDistance.getElement(), typeToDistance.getCount());
        }
    }
}
From source file:org.summer.dsl.xbase.typesystem.references.LightweightTypeReference.java
/**
 * Returns the list of all super types which includes the super class and the
 * implemented interfaces. The type parameters of the provided super types are resolved.
 * That means, the super types of <code>ArrayList&lt;String&gt;</code> include
 * <code>List&lt;String&gt;</code> and <code>Collection&lt;String&gt;</code>
 * rather than <code>Collection&lt;E&gt;</code>.
 *
 * @return the list of all super types, can be empty.
 */
public List<LightweightTypeReference> getAllSuperTypes() {
    final List<LightweightTypeReference> result = Lists.newArrayList();
    final Multiset<JvmType> distances = HashMultiset.create(7);
    final Multiset<JvmType> counterPerType = HashMultiset.create(7);
    collectSuperTypes(new SuperTypeAcceptor() {
        int counter = 0;

        public boolean accept(LightweightTypeReference superType, int distance) {
            JvmType type = superType.getType();
            counterPerType.add(type, counter++);
            if (distances.contains(type)) {
                int currentCount = distances.count(type);
                if (currentCount < distance + 1) {
                    distances.setCount(type, distance + 1);
                } else {
                    return false;
                }
            } else {
                result.add(superType);
                distances.add(type, distance + 1);
            }
            return true;
        }
    });
    Collections.sort(result, new Comparator<LightweightTypeReference>() {
        public int compare(@Nullable LightweightTypeReference o1, @Nullable LightweightTypeReference o2) {
            if (o1 == null || o2 == null) {
                throw new IllegalArgumentException();
            }
            JvmType type1 = o1.getType();
            JvmType type2 = o2.getType();
            if (type1 == null)
                return 1;
            if (type2 == null)
                return -1;
            int distanceCompare = Ints.compare(distances.count(type1), distances.count(type2));
            if (distanceCompare != 0)
                return distanceCompare;
            return Ints.compare(counterPerType.count(type1), counterPerType.count(type2));
        }
    });
    return result;
}
From source file:org.eclipse.xtext.xbase.typesystem.references.LightweightTypeReference.java
/**
 * Returns the list of all super types which includes the super class and the
 * implemented interfaces. The type parameters of the provided super types are resolved.
 * That means, the super types of <code>ArrayList&lt;String&gt;</code> include
 * <code>List&lt;String&gt;</code> and <code>Collection&lt;String&gt;</code>
 * rather than <code>Collection&lt;E&gt;</code>.
 *
 * @return the list of all super types, can be empty.
 */
public List<LightweightTypeReference> getAllSuperTypes() {
    final List<LightweightTypeReference> result = Lists.newArrayList();
    final Multiset<JvmType> distances = HashMultiset.create(7);
    final Multiset<JvmType> counterPerType = HashMultiset.create(7);
    collectSuperTypes(new SuperTypeAcceptor() {
        int counter = 0;

        @Override
        public boolean accept(LightweightTypeReference superType, int distance) {
            JvmType type = superType.getType();
            counterPerType.add(type, counter++);
            if (distances.contains(type)) {
                int currentCount = distances.count(type);
                if (currentCount < distance + 1) {
                    distances.setCount(type, distance + 1);
                } else {
                    return false;
                }
            } else {
                result.add(superType);
                distances.add(type, distance + 1);
            }
            return true;
        }
    });
    Collections.sort(result, new Comparator<LightweightTypeReference>() {
        @Override
        public int compare(/* @Nullable */ LightweightTypeReference o1,
                /* @Nullable */ LightweightTypeReference o2) {
            if (o1 == null || o2 == null) {
                throw new IllegalArgumentException();
            }
            JvmType type1 = o1.getType();
            JvmType type2 = o2.getType();
            if (type1 == null)
                return 1;
            if (type2 == null)
                return -1;
            int distanceCompare = Ints.compare(distances.count(type1), distances.count(type2));
            if (distanceCompare != 0)
                return distanceCompare;
            return Ints.compare(counterPerType.count(type1), counterPerType.count(type2));
        }
    });
    return result;
}
From source file:org.icgc.dcc.submission.dictionary.DictionaryValidator.java
private void validateCodeLists(Set<DictionaryConstraintViolation> errors,
        Set<DictionaryConstraintViolation> warnings) {
    for (val codeListName : dictionary.getCodeListNames()) {
        val collection = codeListIndex.get(codeListName);
        int count = collection.size();
        if (count == 0) {
            warnings.add(new DictionaryConstraintViolation("Missing code list", codeListName));
            break;
        }
        if (count > 1) {
            errors.add(new DictionaryConstraintViolation("Duplicate code lists", collection));
        }
        val codeList = getFirst(collection, null);
        Multiset<String> codes = HashMultiset.create();
        Multiset<String> values = HashMultiset.create();
        for (val term : codeList.getTerms()) {
            codes.add(term.getCode());
            values.add(term.getValue());
        }
        for (val term : codeList.getTerms()) {
            val code = term.getCode();
            val value = term.getValue();
            if (codes.count(code) > 1) {
                errors.add(new DictionaryConstraintViolation("Duplicate code list codes", term, code, codeList));
            }
            if (values.count(value) > 1) {
                errors.add(new DictionaryConstraintViolation("Duplicate code list values", term, value, codeList));
            }
            if (codes.contains(value) && !code.equals(value)) {
                errors.add(new DictionaryConstraintViolation("Non-disjoint code list code and value", term,
                        value, codeList));
            }
        }
    }
}
From source file:fr.ens.transcriptome.aozan.collectors.UndeterminedIndexesProcessThreads.java
/**
 * Compute for a sample the number of clusters that can be recovered.
 * @param sampleName sample name
 * @param indicesCounts multiset that contains the data to process
 * @param resultKeySuffix the suffix for the run data key entry
 * @return the number of clusters that can be recovered for the sample
 */
private int computeRecoverableSampleClusterCount(final String sampleName,
        final Multiset<String> indicesCounts, final String resultKeySuffix) {
    int recoverableClusterCount = 0;
    if (!this.isSkipProcessResult) {
        // Sum the number of clusters that can be recovered
        if (this.newSamplesIndexes.containsKey(sampleName)) {
            for (final String newIndex : this.newSamplesIndexes.get(sampleName)) {
                if (indicesCounts.contains(newIndex)) {
                    final int count = indicesCounts.count(newIndex);
                    recoverableClusterCount += count;
                }
            }
        }
    }
    // Set the result for the sample
    getResults().put("undeterminedindices" + ".lane" + this.lane + ".sample." + sampleName + resultKeySuffix,
            recoverableClusterCount);
    return recoverableClusterCount;
}
From source file:de.huberlin.german.korpling.laudatioteitool.SplitTEI.java
private TEIValidator.Errors extractPreparationSteps(Document doc)
        throws LaudatioException, IOException, SAXException {
    TEIValidator validator = preparationSchemeURL == null ? new TEIPreparationValidator()
            : new FromURLValidator(preparationSchemeURL);
    Multiset<String> knownPreparationTitles = HashMultiset.create();
    File documentDir = new File(outputDirectory, "PreparationHeader");
    if (!documentDir.exists() && !documentDir.mkdir()) {
        throw new LaudatioException(
                messages.getString("COULD NOT CREATE DIRECTORY") + documentDir.getAbsolutePath());
    }
    Preconditions.checkNotNull(doc.getRootElement().getChild("teiCorpus", null));
    Element preparationRoot = Preconditions
            .checkNotNull(doc.getRootElement().getChild("teiCorpus", null).getChild("teiCorpus", null));
    for (Element preparationHeader : preparationRoot.getChildren("teiHeader", null)) {
        Preconditions.checkState("PreparationHeader".equals(preparationHeader.getAttributeValue("type")));
        // create the subtree for the global corpus header
        Namespace teiNS = Namespace.getNamespace("http://www.tei-c.org/ns/1.0");
        Element tei = new Element("TEI", teiNS);
        tei.addContent(preparationHeader.clone());
        Document newDoc = new Document(tei);
        if (preparationSchemeURL == null) {
            newDoc.addContent(0, new ProcessingInstruction("xml-model",
                    "href=\"" + TEIPreparationValidator.DEFAULT_SCHEME_URL + "\""));
        } else {
            newDoc.addContent(0, new ProcessingInstruction("xml-model",
                    "href=\"" + preparationSchemeURL + "\""));
        }
        // we need to append an empty "text" element after the header
        Element text = new Element("text", teiNS);
        text.setText("");
        tei.addContent(text);
        Element fileDesc = Preconditions
                .checkNotNull(tei.getChild("teiHeader", null).getChild("fileDesc", null));
        String outName = UUID.randomUUID().toString();
        Element titleStmt = Preconditions.checkNotNull(fileDesc.getChild("titleStmt", null));
        Element title = Preconditions.checkNotNull(titleStmt.getChild("title", null));
        String corresp = title.getAttributeValue("corresp");
        if (corresp != null) {
            if (knownPreparationTitles.contains(corresp)) {
                knownPreparationTitles.add(corresp);
                outName = corresp + "_" + knownPreparationTitles.count(corresp);
                log.warn(messages.getString("MORE THAN ONE PREPARATION HEADER"), corresp);
            } else {
                outName = corresp;
                knownPreparationTitles.add(corresp);
            }
        }
        File outputFile = new File(documentDir, outName + ".xml");
        XMLOutputter xmlOut = new XMLOutputter(Format.getPrettyFormat());
        xmlOut.output(newDoc, new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8"));
        log.info(messages.getString("WRITTEN PREPARATION HEADER"), outputFile.getPath());
        validator.validate(outputFile);
    }
    return validator.getErrors();
}
From source file:it.cnr.isti.hpc.dexter.disambiguation.TurkishEntityDisambiguator.java
@Override
public EntityMatchList disambiguate(DexterLocalParams localParams, SpotMatchList sml) {
    entityScoreMap = new HashMap<String, EntityScores>();
    selectedEntities = new HashSet<String>();
    Multiset<String> entityFrequencyMultiset = HashMultiset.create();
    EntityMatchList entities = sml.getEntities();
    String inputText = localParams.getParams().get("text");
    String algorithm = Property.getInstance().get("algorithm");
    String ambigious = Property.getInstance().get("algorithm.ambigious");
    List<Token> inputTokens = Zemberek.getInstance().disambiguateFindTokens(inputText, false, true);
    List<Double> documentVector = DescriptionEmbeddingAverage.getAverageVectorList(inputText);
    Multiset<String> inputTokensMultiset = HashMultiset.create();
    for (Token token : inputTokens) {
        inputTokensMultiset.add(token.getMorphText());
    }
    Multiset<String> domainMultiset = HashMultiset.create();
    Multiset<String> typeMultiset = HashMultiset.create();
    HashMap<String, Double> entitySimMap = new HashMap<String, Double>();
    // if (printCandidateEntities) {
    //     printEntities(entities);
    // }
    HashSet<String> words = new HashSet<String>();
    Multiset<String> leskWords = HashMultiset.create();
    // first pass for finding number of types and domains
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        String id = em.getId();
        if (!entityFrequencyMultiset.contains(id)) {
            entityFrequencyMultiset.add(id);
            Entity entity = em.getEntity();
            words.add(entity.getShingle().getText());
            String type = entity.getPage().getType();
            if (type != null && type.length() > 0) {
                typeMultiset.add(type);
            }
            String domain = entity.getPage().getDomain();
            if (domain != null && domain.length() > 0) {
                domainMultiset.add(domain);
            }
            String desc = entity.getPage().getDescription();
            List<Token> tokens = Zemberek.getInstance().disambiguateFindTokens(desc, false, true);
            for (Token token : tokens) {
                leskWords.add(token.getMorphText());
            }
        } else {
            entityFrequencyMultiset.add(id);
        }
    }
    int maxDomainCount = 0;
    for (String domain : Multisets.copyHighestCountFirst(domainMultiset).elementSet()) {
        maxDomainCount = domainMultiset.count(domain);
        break;
    }
    int maxTypeCount = 0;
    for (String type : Multisets.copyHighestCountFirst(typeMultiset).elementSet()) {
        maxTypeCount = typeMultiset.count(type);
        break;
    }
    double maxSuffixScore = 0, maxLeskScore = 0, maxSimpleLeskScore = 0, maxLinkScore = 0,
            maxHashInfoboxScore = 0, maxwordvecDescriptionLocalScore = 0, maxHashDescriptionScore = 0,
            maxPopularityScore = 0, maxWordvectorAverage = 0, maxWordvecLinksScore = 0;
    // second pass: compute similarities between entities in a window
    int currentSpotIndex = -1;
    SpotMatch currentSpot = null;
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        SpotMatch spot = em.getSpot();
        if (currentSpot == null || spot != currentSpot) {
            currentSpotIndex++;
            currentSpot = spot;
        }
        String id = em.getId();
        Entity entity = entities.get(i).getEntity();
        EntityPage page = entities.get(i).getEntity().getPage();
        String domain = page.getDomain();
        String type = page.getType();
        Shingle shingle = entity.getShingle();
        /* windowing algorithm starts */
        int left = currentSpotIndex - window;
        int right = currentSpotIndex + window;
        if (left < 0) {
            right -= left;
            left = 0;
        }
        if (right > sml.size()) {
            left += (sml.size()) - right;
            right = sml.size();
            if (left < 0) {
                left = 0;
            }
        }
        double linkScore = 0, hashInfoboxScore = 0, wordvecDescriptionLocalScore = 0,
                hashDescriptionScore = 0, wordvecLinksScore = 0;
        for (int j = left; j < right; j++) {
            SpotMatch sm2 = sml.get(j);
            EntityMatchList entities2 = sm2.getEntities();
            for (EntityMatch em2 : entities2) {
                String id2 = em2.getId();
                EntityPage page2 = em2.getEntity().getPage();
                int counter = 0;
                if (!ambigious.equals("true")) {
                    for (EntityMatch entityMatch : entities2) {
                        if (entityMatch.getId().startsWith("w")) {
                            counter++;
                        }
                    }
                }
                if ((ambigious.equals("true") || counter == 1) && em.getSpot() != em2.getSpot()
                        && !id.equals(id2)) {
                    // Link similarity calculation starts
                    double linkSim = 0;
                    if (id.startsWith("w") && id2.startsWith("w")) {
                        if (entitySimMap.containsKey("link" + id + id2)) {
                            linkSim = entitySimMap.get("link" + id + id2);
                        } else {
                            HashSet<String> set1 = Sets.newHashSet(page.getLinks().split(" "));
                            HashSet<String> set2 = Sets.newHashSet(page2.getLinks().split(" "));
                            linkSim = JaccardCalculator.calculateSimilarity(set1, set2);
                            entitySimMap.put("link" + id + id2, linkSim);
                        }
                        linkScore += linkSim;
                        // Link similarity calculation ends
                    }
                    // Entity embedding similarity calculation starts
                    double eeSim = 0;
                    if (id.startsWith("w") && id2.startsWith("w")) {
                        if (entitySimMap.containsKey("ee" + id + id2)) {
                            eeSim = entitySimMap.get("ee" + id + id2);
                        } else {
                            eeSim = EntityEmbeddingSimilarity.getInstance().getSimilarity(page, page2);
                            entitySimMap.put("ee" + id + id2, eeSim);
                        }
                        hashInfoboxScore += eeSim;
                    }
                    double w2veclinksSim = 0;
                    if (id.startsWith("w") && id2.startsWith("w")) {
                        if (entitySimMap.containsKey("wl" + id + id2)) {
                            w2veclinksSim = entitySimMap.get("wl" + id + id2);
                        } else {
                            w2veclinksSim = AveragePooling.getInstance().getSimilarity(page.getWord2vec(),
                                    page2.getWord2vec());
                            entitySimMap.put("wl" + id + id2, w2veclinksSim);
                        }
                        wordvecLinksScore += w2veclinksSim;
                    }
                    // Entity embedding similarity calculation ends
                    // Description word2vec similarity calculation starts
                    double word2vecSim = 0;
                    if (entitySimMap.containsKey("w2v" + id + id2)) {
                        word2vecSim = entitySimMap.get("w2v" + id + id2);
                    } else {
                        word2vecSim = AveragePooling.getInstance().getSimilarity(page2.getDword2vec(),
                                page.getDword2vec());
                        entitySimMap.put("w2v" + id + id2, word2vecSim);
                    }
                    wordvecDescriptionLocalScore += word2vecSim;
                    // Description word2vec similarity calculation ends
                    // Description autoencoder similarity calculation starts
                    double autoVecSim = 0;
                    if (entitySimMap.containsKey("a2v" + id + id2)) {
                        autoVecSim = entitySimMap.get("a2v" + id + id2);
                    } else {
                        autoVecSim = AveragePooling.getInstance().getSimilarity(page2.getDautoencoder(),
                                page.getDautoencoder());
                        entitySimMap.put("a2v" + id + id2, autoVecSim);
                    }
                    hashDescriptionScore += autoVecSim;
                    // Description autoencoder similarity calculation ends
                }
            }
        }
        if (linkScore > maxLinkScore) {
            maxLinkScore = linkScore;
        }
        if (hashInfoboxScore > maxHashInfoboxScore) {
            maxHashInfoboxScore = hashInfoboxScore;
        }
        if (wordvecDescriptionLocalScore > maxwordvecDescriptionLocalScore) {
            maxwordvecDescriptionLocalScore = wordvecDescriptionLocalScore;
        }
        if (hashDescriptionScore > maxHashDescriptionScore) {
            maxHashDescriptionScore = hashDescriptionScore;
        }
        if (wordvecLinksScore > maxWordvecLinksScore) {
            maxWordvecLinksScore = wordvecLinksScore;
        }
        /* windowing algorithm ends */
        double domainScore = 0;
        if (domainMultiset.size() > 0 && maxDomainCount > 1 && domainMultiset.count(domain) > 1) {
            domainScore = (double) domainMultiset.count(domain) / maxDomainCount;
        }
        double typeScore = 0;
        if (typeMultiset.size() > 0 && maxTypeCount > 1 && typeMultiset.count(type) > 1) {
            typeScore = (double) typeMultiset.count(type) / maxTypeCount;
        }
        if (typeBlackList.contains(type)) {
            typeScore /= 10;
        }
        double typeContentScore = 0;
        if (type.length() > 0 && StringUtils.containsIgnoreCase(words.toString(), type)) {
            typeContentScore = 1;
        }
        double typeClassifierScore = TypeClassifier.getInstance().predict(page, page.getTitle(),
                page.getType(), entity.getShingle().getSentence());
        double wordvecDescriptionScore = AveragePooling.getInstance().getSimilarity(documentVector,
                page.getDword2vec());
        if (wordvecDescriptionScore > maxWordvectorAverage) {
            maxWordvectorAverage = wordvecDescriptionScore;
        }
        double suffixScore = 0;
        if (type != null && type.length() > 0) {
            Set<String> suffixes = new HashSet<String>();
            String t = entity.getTitle().toLowerCase(new Locale("tr", "TR"));
            for (int x = 0; x < entities.size(); x++) {
                EntityMatch e2 = entities.get(x);
                if (e2.getId().equals(entity.getId())) {
                    suffixes.add(e2.getMention());
                }
            }
            suffixes.remove(t);
            suffixes.remove(entity.getTitle());
            // String inputTextLower = inputText.toLowerCase(new Locale("tr", "TR"));
            // while (inputTextLower.contains(t)) {
            //     int start = inputTextLower.indexOf(t);
            //     int end = inputTextLower.indexOf(" ", start + t.length());
            //     if (end > start) {
            //         String suffix = inputTextLower.substring(start, end);
            //         // .replaceAll("\\W", "");
            //         if (suffix.contains("'")
            //                 || (Zemberek.getInstance().hasMorph(suffix)
            //                         && !suffix.equals(t) && suffix.length() > 4)) {
            //             suffixes.add(suffix);
            //         }
            //         inputTextLower = inputTextLower.substring(end);
            //     } else {
            //         break;
            //     }
            // }
            if (suffixes.size() >= minSuffix) {
                for (String suffix : suffixes) {
                    double sim = gd.calculateSimilarity(suffix, type);
                    suffixScore += sim;
                }
            }
        }
        // String entitySuffix = page.getSuffix();
        // String[] inputSuffix = shingle.getSuffix().split(" ");
        // for (int j = 0; j < inputSuffix.length; j++) {
        //     if (entitySuffix.contains(inputSuffix[j])) {
        //         suffixScore += 0.25f;
        //     }
        // }
        if (suffixScore > maxSuffixScore) {
            maxSuffixScore = suffixScore;
        }
        // if (id.equals("w691538")) {
        //     LOGGER.info("");
        // }
        double letterCaseScore = 0;
        int lc = page.getLetterCase();
        if (StringUtils.isAllLowerCase(em.getMention()) && lc == 0 && id.startsWith("t")) {
            letterCaseScore = 1;
        } else if (StringUtils.isAllUpperCase(em.getMention()) && lc == 1 && id.startsWith("w")) {
            letterCaseScore = 1;
        } else if (Character.isUpperCase(em.getMention().charAt(0)) && lc == 2 && id.startsWith("w")) {
            letterCaseScore = 1;
        } else if (StringUtils.isAllLowerCase(em.getMention()) && id.startsWith("t")) {
            letterCaseScore = 1;
        }
        double nameScore = 1 - LevenshteinDistanceCalculator.calculateDistance(page.getTitle(),
                Zemberek.removeAfterSpostrophe(em.getMention()));
        double popularityScore = page.getRank();
        if (id.startsWith("w")) {
            popularityScore = Math.log10(popularityScore + 1);
            if (popularityScore > maxPopularityScore) {
                maxPopularityScore = popularityScore;
            }
        }
        double leskScore = 0, simpleLeskScore = 0;
        String desc = em.getEntity().getPage().getDescription();
        if (desc != null) {
            List<Token> tokens = Zemberek.getInstance().disambiguateFindTokens(desc, false, true);
            for (Token token : tokens) {
                if (inputTokensMultiset.contains(token.getMorphText())
                        && !TurkishNLP.isStopWord(token.getMorphText())) {
                    simpleLeskScore += inputTokensMultiset.count(token.getMorphText());
                }
                if (leskWords.contains(token.getMorphText())
                        && !TurkishNLP.isStopWord(token.getMorphText())) {
                    leskScore += leskWords.count(token.getMorphText());
                }
            }
            leskScore /= Math.log(tokens.size() + 1);
            simpleLeskScore /= Math.log(tokens.size() + 1);
            if (leskScore > maxLeskScore) {
                maxLeskScore = leskScore;
            }
            if (simpleLeskScore > maxSimpleLeskScore) {
                maxSimpleLeskScore = simpleLeskScore;
            }
            if (!entityScoreMap.containsKey(id)) {
                EntityScores scores = new EntityScores(em, id, popularityScore, nameScore, letterCaseScore,
                        suffixScore, wordvecDescriptionScore, typeContentScore, typeScore, domainScore,
                        hashDescriptionScore, wordvecDescriptionLocalScore, hashInfoboxScore, linkScore,
                        wordvecLinksScore, leskScore, simpleLeskScore, typeClassifierScore);
                entityScoreMap.put(id, scores);
            } else {
                EntityScores entityScores = entityScoreMap.get(id);
                entityScores.setHashInfoboxScore((entityScores.getHashInfoboxScore() + hashInfoboxScore) / 2);
                entityScores.setHashDescriptionScore(
                        (entityScores.getHashInfoboxScore() + hashDescriptionScore) / 2);
                entityScores.setLinkScore((entityScores.getLinkScore() + linkScore) / 2);
                entityScores.setWordvecDescriptionLocalScore(
                        (entityScores.getWordvecDescriptionLocalScore() + wordvecDescriptionLocalScore) / 2);
                entityScores.setWordvecLinksScore(
                        (entityScores.getWordvecLinksScore() + wordvecLinksScore) / 2);
                entityScores.setLeskScore((entityScores.getLeskScore() + leskScore) / 2);
            }
        }
    }
    /* normalization and total score calculation starts */
    Set<String> set = new HashSet<String>();
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        String id = em.getId();
        EntityScores entityScores = entityScoreMap.get(id);
        if (set.contains(id)) {
            continue;
        }
        if (id.startsWith("w")) {
            if (maxLinkScore > 0 && entityScores.getLinkScore() > 0) {
                entityScores.setLinkScore(entityScores.getLinkScore() / maxLinkScore);
            }
            if (maxHashInfoboxScore > 0 && entityScores.getHashInfoboxScore() > 0) {
                entityScores.setHashInfoboxScore(entityScores.getHashInfoboxScore() / maxHashInfoboxScore);
            }
            if (maxWordvecLinksScore > 0 && entityScores.getWordvecLinksScore() > 0) {
                entityScores.setWordvecLinksScore(entityScores.getWordvecLinksScore() / maxWordvecLinksScore);
            }
            if (maxPopularityScore > 0 && entityScores.getPopularityScore() > 0) {
                entityScores.setPopularityScore(entityScores.getPopularityScore() / maxPopularityScore);
            }
        }
        if (maxwordvecDescriptionLocalScore > 0 && entityScores.getWordvecDescriptionLocalScore() > 0) {
            entityScores.setWordvecDescriptionLocalScore(
                    entityScores.getWordvecDescriptionLocalScore() / maxwordvecDescriptionLocalScore);
        }
        if (maxHashDescriptionScore > 0 && entityScores.getHashDescriptionScore() > 0) {
            entityScores.setHashDescriptionScore(
                    entityScores.getHashDescriptionScore() / maxHashDescriptionScore);
        }
        if (maxWordvectorAverage > 0 && entityScores.getWordvecDescriptionScore() > 0) {
            entityScores.setWordvecDescriptionScore(
                    entityScores.getWordvecDescriptionScore() / maxWordvectorAverage);
        }
        if (maxLeskScore > 0 && entityScores.getLeskScore() > 0) {
            entityScores.setLeskScore(entityScores.getLeskScore() / maxLeskScore);
        }
        if (maxSimpleLeskScore > 0 && entityScores.getSimpleLeskScore() > 0) {
            entityScores.setSimpleLeskScore(entityScores.getSimpleLeskScore() / maxSimpleLeskScore);
        }
        if (maxSuffixScore > 0 && entityScores.getSuffixScore() > 0) {
            entityScores.setSuffixScore(entityScores.getSuffixScore() / maxSuffixScore);
        }
        set.add(id);
    }
    LOGGER.info("\t"
            + "id\tTitle\tURL\tScore\tPopularity\tName\tLesk\tSimpeLesk\tCase\tNoun\tSuffix\tTypeContent\tType\tDomain\twordvecDescription\twordvecDescriptionLocal\thashDescription\thashInfobox\tword2vecLinks\tLink\t\ttypeClassifier\tDescription");
    for (int i = 0; i < entities.size(); i++) {
        EntityMatch em = entities.get(i);
        String id = em.getId();
        EntityScores e = entityScoreMap.get(id);
        double wikiScore = 0;
        if (id.startsWith("w") && Character.isUpperCase(em.getMention().charAt(0))) {
            wikiScore = wikiWeight;
        } else if (id.startsWith("t") && Character.isLowerCase(em.getMention().charAt(0))) {
            wikiScore = wikiWeight;
        }
        // if (id.equals("w508792")) {
        //     LOGGER.info("");
        // }
        double totalScore = wikiScore + e.getPopularityScore() * popularityWeight
                + e.getNameScore() * nameWeight + e.getLeskScore() * leskWeight
                + e.getSimpleLeskScore() * simpleLeskWeight + e.getLetterCaseScore() * letterCaseWeight
                + e.getSuffixScore() * suffixWeight + e.getTypeContentScore() * typeContentWeight
                + e.getTypeScore() * typeWeight + e.getDomainScore() * domainWeight
                + e.getWordvecDescriptionScore() * wordvecDescriptionWeight
                + e.getWordvecDescriptionLocalScore() * wordvecDescriptionLocalWeight
                + e.getHashDescriptionScore() * hashDescriptionWeight
                + e.getHashInfoboxScore() * hashInfoboxWeight
                + e.getWordvecLinksScore() * word2vecLinksWeight + e.getLinkScore() * linkWeight
                + e.getTypeClassifierkScore() * typeClassifierkWeight;
        if (ranklib == true) {
            totalScore = RankLib.getInstance().score(e);
        }
        if (em.getEntity().getPage().getUrlTitle().contains("(")) {
            totalScore /= 2;
        }
        em.setScore(totalScore);
        e.setScore(totalScore);
        LOGGER.info("\t" + id + "\t" + em.getEntity().getPage().getTitle() + "\t"
                + em.getEntity().getPage().getUrlTitle() + "\t" + em.getScore() + "\t"
                + e.getPopularityScore() * popularityWeight + "\t" + e.getNameScore() * nameWeight + "\t"
                + e.getLeskScore() * leskWeight + "\t" + e.getSimpleLeskScore() * simpleLeskWeight + "\t"
                + e.getLetterCaseScore() * letterCaseWeight + "\t" + e.getSuffixScore() * suffixWeight + "\t"
                + e.getTypeContentScore() * typeContentWeight + "\t" + e.getTypeScore() * typeWeight + "\t"
                + e.getDomainScore() * domainWeight + "\t"
                + e.getWordvecDescriptionScore() * wordvecDescriptionWeight + "\t"
                + e.getWordvecDescriptionLocalScore() * wordvecDescriptionLocalWeight + "\t"
                + e.getHashDescriptionScore() * hashDescriptionWeight + "\t"
                + e.getHashInfoboxScore() * hashInfoboxWeight + "\t"
                + e.getWordvecLinksScore() * word2vecLinksWeight + "\t" + e.getLinkScore() * linkWeight + "\t"
                + e.getTypeClassifierkScore() * typeClassifierkWeight + "\t"
                + em.getEntity().getPage().getDescription());
    }
    // if (annotateEntities) {
    //     annotateEntities(localParams.getParams().get("originalText"), sml);
    // }
    EntityMatchList eml = new EntityMatchList();
    for (SpotMatch match : sml) {
        EntityMatchList list = match.getEntities();
        if (!list.isEmpty()) {
            list.sort();
            eml.add(list.get(0));
            selectedEntities.add(list.get(0).getId());
        }
    }
    return eml;
}
From source file:org.opencb.opencga.storage.mongodb.variant.VariantMongoDBAdaptor.java
/**
 * Two-step insertion:
 * First, check that the variant and study exist by making an update.
 * For those that don't exist, push a study with the file and genotype information.
 * <p>
 * The documents that throw a "dup key" exception are those variants that exist and have the study.
 * Then, only for those variants, make a second update.
 * <p>
 * *An interesting idea would be to invert these actions depending on the number of already inserted variants.
 *
 * @param data Variants to insert
 * @param fileId File ID
 * @param variantConverter Variant converter to be used
 * @param variantSourceEntryConverter Variant source converter to be used
 * @param studyConfiguration Configuration for the study
 * @param loadedSampleIds Other loaded sampleIds EXCEPT those that are going to be loaded
 * @return QueryResult object
 */
QueryResult<MongoDBVariantWriteResult> insert(List<Variant> data, int fileId,
        DocumentToVariantConverter variantConverter,
        DocumentToStudyVariantEntryConverter variantSourceEntryConverter,
        StudyConfiguration studyConfiguration, List<Integer> loadedSampleIds) {
    MongoDBVariantWriteResult writeResult = new MongoDBVariantWriteResult();
    long startTime = System.currentTimeMillis();
    if (data.isEmpty()) {
        return new QueryResult<>("insertVariants", 0, 1, 1, "", "", Collections.singletonList(writeResult));
    }
    List<Bson> queries = new ArrayList<>(data.size());
    List<Bson> updates = new ArrayList<>(data.size());
    // Use a multiset instead of a normal set, to keep tracking of duplicated variants
    Multiset<String> nonInsertedVariants = HashMultiset.create();
    String fileIdStr = Integer.toString(fileId);
    // List<String> extraFields = studyConfiguration.getAttributes()
    //         .getAsStringList(VariantStorageManager.Options.EXTRA_GENOTYPE_FIELDS.key());
    boolean excludeGenotypes = studyConfiguration.getAttributes().getBoolean(
            VariantStorageManager.Options.EXCLUDE_GENOTYPES.key(),
            VariantStorageManager.Options.EXCLUDE_GENOTYPES.defaultValue());
    long nanoTime = System.nanoTime();
    Map missingSamples = Collections.emptyMap();
    String defaultGenotype = studyConfiguration.getAttributes().getString(DEFAULT_GENOTYPE.key(), "");
    if (defaultGenotype.equals(DocumentToSamplesConverter.UNKNOWN_GENOTYPE)) {
        logger.debug("Do not need fill gaps. DefaultGenotype is UNKNOWN_GENOTYPE({}).",
                DocumentToSamplesConverter.UNKNOWN_GENOTYPE);
    } else if (excludeGenotypes) {
        logger.debug("Do not need fill gaps. Excluding genotypes.");
    } else if (!loadedSampleIds.isEmpty()) {
        missingSamples = new Document(DocumentToSamplesConverter.UNKNOWN_GENOTYPE, loadedSampleIds); // ?/?
    }
    // List<Object> missingOtherValues = new ArrayList<>(loadedSampleIds.size());
    // for (int i = 0; i < loadedSampleIds.size(); i++) {
    //     missingOtherValues.add(DBObjectToSamplesConverter.UNKNOWN_FIELD);
    // }
    for (Variant variant : data) {
        if (variant.getType().equals(VariantType.NO_VARIATION)) {
            // Storage-MongoDB is not able to store NON VARIANTS
            writeResult.setSkippedVariants(writeResult.getSkippedVariants() + 1);
            continue;
        } else if (variant.getType().equals(VariantType.SYMBOLIC)) {
            logger.warn("Skip symbolic variant " + variant.toString());
            writeResult.setSkippedVariants(writeResult.getSkippedVariants() + 1);
            continue;
        }
        String id = variantConverter.buildStorageId(variant);
        for (StudyEntry studyEntry : variant.getStudies()) {
            if (studyEntry.getFiles().size() == 0
                    || !studyEntry.getFiles().get(0).getFileId().equals(fileIdStr)) {
                continue;
            }
            int studyId = studyConfiguration.getStudyId();
            Document study = variantSourceEntryConverter.convertToStorageType(studyEntry);
            Document genotypes = study.get(DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD, Document.class);
            if (genotypes != null) { // If genotypes is null, genotypes are not supposed to be loaded
                genotypes.putAll(missingSamples); // Add missing samples
                // for (String extraField : extraFields) {
                //     List<Object> otherFieldValues = (List<Object>) study.get(extraField.toLowerCase());
                //     otherFieldValues.addAll(0, missingOtherValues);
                // }
            }
            Document push = new Document(DocumentToVariantConverter.STUDIES_FIELD, study);
            Document update = new Document().append("$push", push).append("$setOnInsert",
                    variantConverter.convertToStorageType(variant));
            if (variant.getIds() != null && !variant.getIds().isEmpty()
                    && !variant.getIds().iterator().next().isEmpty()) {
                update.put("$addToSet", new Document(DocumentToVariantConverter.IDS_FIELD,
                        new Document("$each", variant.getIds())));
            }
            // { _id: <variant_id>, "studies.sid": {$ne: <studyId> } }
            // If the variant exists and contains the study, this find will fail, will try to do the upsert,
            // and throw a duplicated key exception.
            queries.add(new Document("_id", id).append(
                    DocumentToVariantConverter.STUDIES_FIELD + "."
                            + DocumentToStudyVariantEntryConverter.STUDYID_FIELD,
                    new Document("$ne", studyId)));
            updates.add(update);
        }
    }
    // if (!queries.isEmpty()) {
    {
        QueryOptions options = new QueryOptions(UPSERT, true);
        options.put(MULTI, false);
        int newDocuments;
        int updatedObjects;
        try {
            BulkWriteResult bulkWriteResult;
            bulkWriteResult = variantsCollection.update(queries, updates, options).first();
            newDocuments = bulkWriteResult.getUpserts().size();
            updatedObjects = bulkWriteResult.getModifiedCount();
        } catch (MongoBulkWriteException e) {
            BulkWriteResult bulkWriteResult;
            bulkWriteResult = e.getWriteResult();
            newDocuments = bulkWriteResult.getUpserts().size();
            updatedObjects = bulkWriteResult.getModifiedCount();
            for (BulkWriteError writeError : e.getWriteErrors()) {
                if (writeError.getCode() == 11000) { // Dup Key error code
                    Matcher matcher = writeResultErrorPattern.matcher(writeError.getMessage());
                    if (matcher.find()) {
                        String id = matcher.group(1);
                        nonInsertedVariants.add(id);
                    } else {
                        throw e;
                    }
                } else {
                    throw e;
                }
            }
        }
        writeResult.setNewVariants(newDocuments);
        writeResult.setUpdatedVariants(updatedObjects);
        // writeResult.setNewDocuments(data.size() - nonInsertedVariants.size() - writeResult.getSkippedVariants());
        queries.clear();
        updates.clear();
    }
    writeResult.setNewVariantsNanoTime(System.nanoTime() - nanoTime);
    nanoTime = System.nanoTime();
    for (Variant variant : data) {
        variant.setAnnotation(null);
        String id = variantConverter.buildStorageId(variant);
        if (nonInsertedVariants != null && !nonInsertedVariants.contains(id)) {
            continue; // Already inserted variant
        }
        for (StudyEntry studyEntry : variant.getStudies()) {
            if (studyEntry.getFiles().size() == 0
                    || !studyEntry.getFiles().get(0).getFileId().equals(fileIdStr)) {
                continue;
            }
            Document studyObject = variantSourceEntryConverter.convertToStorageType(studyEntry);
            Document genotypes = studyObject.get(DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD,
                    Document.class);
            Document push = new Document();
            if (!excludeGenotypes) {
                if (genotypes != null) { // If genotypes is null, genotypes are not supposed to be loaded
                    for (String genotype : genotypes.keySet()) {
                        push.put(DocumentToVariantConverter.STUDIES_FIELD + ".$."
                                + DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD + "." + genotype,
                                new Document("$each", genotypes.get(genotype)));
                    }
                    // for (String extraField : extraFields) {
                    //     List values = (List) studyObject.get(extraField.toLowerCase());
                    //     push.put(DBObjectToVariantConverter.STUDIES_FIELD + ".$." + extraField.toLowerCase(),
                    //             new Document("$each", values).append("$position", loadedSampleIds.size()));
                    // }
                } else {
                    push.put(DocumentToVariantConverter.STUDIES_FIELD + ".$."
                            + DocumentToStudyVariantEntryConverter.GENOTYPES_FIELD, Collections.emptyMap());
                }
            }
            push.put(DocumentToVariantConverter.STUDIES_FIELD + ".$."
                    + DocumentToStudyVariantEntryConverter.FILES_FIELD,
                    ((List) studyObject.get(DocumentToStudyVariantEntryConverter.FILES_FIELD)).get(0));
            Document update = new Document(new Document("$push", push));
            queries.add(new Document("_id", id)
                    .append(DocumentToVariantConverter.STUDIES_FIELD + '.'
                            + DocumentToStudyVariantEntryConverter.STUDYID_FIELD,
                            studyConfiguration.getStudyId())
                    .append(DocumentToVariantConverter.STUDIES_FIELD + '.'
                            + DocumentToStudyVariantEntryConverter.FILES_FIELD + '.'
                            + DocumentToStudyVariantEntryConverter.FILEID_FIELD,
                            new Document("$ne", fileId)));
            updates.add(update);
        }
    }
    writeResult.setExistingVariantsNanoTime(System.nanoTime() - nanoTime);
    if (!queries.isEmpty()) {
        QueryOptions options = new QueryOptions(UPSERT, false);
        options.put(MULTI, false);
        QueryResult<BulkWriteResult> update = variantsCollection.update(queries, updates, options);
        // Can happen that nonInsertedVariantsNum != queries.size() != nonInsertedVariants.size() if there was
        // a duplicated variant.
        writeResult.setNonInsertedVariants(nonInsertedVariants.size() - update.first().getMatchedCount());
        writeResult.setUpdatedVariants(writeResult.getUpdatedVariants() + update.first().getModifiedCount());
    }
    return new QueryResult<>("insertVariants", ((int) (System.currentTimeMillis() - startTime)), 1, 1, "", "",
            Collections.singletonList(writeResult));
}