Usage examples for org.apache.commons.math3.stat.descriptive.DescriptiveStatistics.getMean()
public double getMean()
From source file:org.cirdles.ludwig.isoplot3.Means.java
/**
 * Ludwig's WeightedAverage and assumes ConstExtErr = true since all
 * possible values are returned and caller can decide.
 *
 * <p>The inputs are cloned, so the caller's arrays are never modified. If the two arrays differ in
 * length or hold fewer than 3 points, the initial result {@code {{0,0,0,0,0,0,0}, {}}} is returned.
 *
 * @param inValues as double[] with length nPts
 * @param inErrors as double[] with length nPts
 * @param canReject when true and the probability of fit is below SQUID_MINIMUM_PROBABILITY,
 * outlying points are zeroed out and the average is recalculated until no further point is rejected
 * @param canTukeys when true, Tukey's biweight and median statistics are computed from the
 * (possibly rejected) values; none of them is part of the returned array
 * @return double[2][]: {mean, sigmaMean, err68, err95, MSWD, probability, externalFlag}, {values
 * with rejected as 0.0}. externalFlag = 1.0 for external uncertainty, 0.0 for internal. In the
 * external case the second entry of the first row is extSigma.
 */
public static double[][] weightedAverage(double[] inValues, double[] inErrors, boolean canReject,
        boolean canTukeys) {

    // work on copies: rejected points are overwritten with 0.0 below
    double[] values = inValues.clone();
    double[] errors = inErrors.clone();

    double[][] retVal = new double[][] { { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }, {} };

    // check precondition of same size values and errors and at least 3 points
    int nPts = values.length;
    // nN tracks the number of points still in play (not rejected)
    int nN = nPts;
    // count is the number of passes through the recalc loop; it widens the rejection tolerance
    int count = 0;

    // where does this come from??
    // NOTE(review): hardRej is never set to true in this method, so the 1.25 factor below is dead
    boolean hardRej = false;

    if ((nPts == errors.length) && nPts > 2) {
        // proceed
        double[] inverseVar = new double[nPts];
        double[] wtdResid = new double[nPts];
        double[] yy;
        double[] iVarY;
        double[] tbX = new double[nPts];
        // rejected (value, error) pairs are recorded here but not returned
        double[][] wRejected = new double[nPts][2];

        // an error of 0.0 yields Infinity here; such points are skipped by the
        // values[i] * errors[i] != 0.0 tests below
        for (int i = 0; i < nPts; i++) {
            inverseVar[i] = 1.0 / Math.pow(errors[i], 2);
        }

        double intMean = 0.0;
        double MSWD = 0.0;
        double intSigmaMean = 0.0;
        double probability = 0.0;
        double intMeanErr95 = 0.0;
        double intErr68 = 0.0;

        double extMean = 0.0;
        double extMeanErr95 = 0.0;
        double extMeanErr68 = 0.0;
        double extSigma = 0.0;

        double biWtMean = 0.0;
        double biWtSigma = 0.0;

        boolean reCalc;
        // entry point for RECALC goto - consider another private method?
        do {
            reCalc = false;
            extSigma = 0.0;
            double weight = 0.0;
            double sumWtdRatios = 0.0;
            double q = 0.0;
            count++;

            // accumulate inverse-variance weights over the points that are still active
            for (int i = 0; i < nPts; i++) {
                if (values[i] * errors[i] != 0.0) {
                    weight += inverseVar[i];
                    sumWtdRatios += inverseVar[i] * values[i];
                    q += inverseVar[i] * Math.pow(values[i], 2);
                }
            }

            int nU = nN - 1;// ' Deg. freedom
            TDistribution studentsT = new TDistribution(nU);
            // see https://stackoverflow.com/questions/21730285/calculating-t-inverse
            // for explanation of cutting the tail mass in two to get agreement with Excel two-tail
            double t68 = Math.abs(studentsT.inverseCumulativeProbability((1.0 - 0.6826) / 2.0));
            double t95 = Math.abs(studentsT.inverseCumulativeProbability((1.0 - 0.95) / 2));

            intMean = sumWtdRatios / weight;// ' "Internal" error of wtd average

            double sums = 0.0;
            for (int i = 0; i < nPts; i++) {
                if (values[i] * errors[i] != 0.0) {
                    double resid = values[i] - intMean;// ' Simple residual
                    wtdResid[i] = resid / errors[i];// ' Wtd residual
                    double wtdR2 = Math.pow(wtdResid[i], 2);//' Square of wtd residual
                    sums += wtdR2;
                }
            }
            sums = Math.max(sums, 0.0);

            MSWD = sums / nU;// ' Mean square of weighted deviates
            intSigmaMean = Math.sqrt(1.0 / weight);

            // http://commons.apache.org/proper/commons-math/apidocs/org/apache/commons/math3/distribution/FDistribution.html
            // NOTE(review): the very large denominator degrees of freedom (1E9) presumably stands in
            // for the chi-square test named in the trailing comment - confirm
            FDistribution fdist = new FDistribution(nU, 1E9);
            probability = 1.0 - fdist.cumulativeProbability(MSWD);// ChiSquare(.MSWD, (nU))
            // at probability >= 0.3 fixed multipliers are used; otherwise Student's t scaled by sqrt(MSWD)
            intMeanErr95 = intSigmaMean * (double) (probability >= 0.3 ? 1.96 : t95 * Math.sqrt(MSWD));
            intErr68 = intSigmaMean * (double) (probability >= 0.3 ? 0.9998 : t68 * Math.sqrt(MSWD));

            extMean = 0.0;
            extMeanErr95 = 0.0;
            extMeanErr68 = 0.0;

            // need to find external uncertainty
            List<Double> yyList = new ArrayList<>();
            List<Double> iVarYList = new ArrayList<>();
            if ((probability < SQUID_MINIMUM_PROBABILITY) && (MSWD > 1.0)) {
                // Find the MLE constant external variance
                // nN is recounted here from the non-zero values
                nN = 0;
                for (int i = 0; i < nPts; i++) {
                    if (values[i] != 0.0) {
                        yyList.add(values[i]);
                        // NOTE(review): despite the "iVar" name this stores errors[i]^2, not its inverse
                        iVarYList.add(errors[i] * errors[i]);
                        nN++;
                    }
                }
                // resize arrays
                yy = yyList.stream().mapToDouble(Double::doubleValue).toArray();
                iVarY = iVarYList.stream().mapToDouble(Double::doubleValue).toArray();

                // call secant method
                double[] wtdExtRtsec = wtdExtRTSEC(0, 10.0 * intSigmaMean * intSigmaMean, yy, iVarY);

                // check for failure
                // index 3 is treated as a failure flag (0.0 = success); indices 0, 1, 2 are used as
                // external variance, external mean and a sigma term respectively
                if (wtdExtRtsec[3] == 0.0) {
                    extMean = wtdExtRtsec[1];
                    extSigma = Math.sqrt(wtdExtRtsec[0]);
                    studentsT = new TDistribution(2 * nN - 2);
                    extMeanErr95 = Math.abs(studentsT.inverseCumulativeProbability((1.0 - 0.95) / 2.0))
                            * wtdExtRtsec[2];
                } else if (MSWD > 4.0) {
                    //Failure of RTSEC algorithm because of extremely high MSWD
                    // fall back to the plain (unweighted) mean and standard deviation
                    DescriptiveStatistics stats = new DescriptiveStatistics(yy);
                    extSigma = stats.getStandardDeviation();
                    extMean = stats.getMean();
                    extMeanErr95 = t95 * extSigma / Math.sqrt(nN);
                } else {
                    extSigma = 0.0;
                    extMean = 0.0;
                    extMeanErr95 = 0.0;
                }
                extMeanErr68 = t68 / t95 * extMeanErr95;
            }

            if (canReject && (probability < SQUID_MINIMUM_PROBABILITY)) {
                // GOSUB REJECT
                double wtdAvg = 0.0;
                if (extSigma != 0.0) {
                    wtdAvg = extMean;
                } else {
                    wtdAvg = intMean;
                }
                // reject outliers
                int n0 = nN;
                for (int i = 0; i < nPts; i++) {
                    // NOTE(review): the guard stops rejecting once nN <= 0.85 * nPts, i.e. after
                    // roughly 15% of the points are gone, not the 30% the comments mention
                    if ((values[i] != 0.0) && (nN > 0.85 * nPts)) { // Reject no more than 30% of ratios
                        // Start rej. tolerance at 2-sigma, increase slightly each pass.
                        double pointError = 2.0 * Math.sqrt(errors[i] * errors[i] + extSigma * extSigma); // 2-sigma error of point being tested
                        double totalError = Math
                                .sqrt(pointError * pointError + (4.0 * extMeanErr68 * extMeanErr68));
                        // 1st-pass tolerance is 2-sigma; 2nd is 2.25-sigma; 3rd is 2.5-sigma.
                        // NOTE(review): the factor is 1 + (count - 1) / 4 applied to a 2-sigma total,
                        // which gives 2, 2.5, 3 sigma - matching the second comment below, not this one
                        double tolerance = (1.0 + (double) (count - 1.0) / 4.0) * totalError;
                        if (hardRej) {
                            tolerance = tolerance * 1.25;
                        }
                        // 1st-pass tolerance is 2-sigma; 2nd is 2.5-sigma; 3rd is 3-sigma...
                        q = values[i] - wtdAvg;
                        // never reject below 2 remaining points
                        if ((Math.abs(q) > tolerance) && nN > 2) {
                            nN--;
                            wRejected[i][0] = values[i];
                            values[i] = 0.0;
                            wRejected[i][1] = errors[i];
                            errors[i] = 0.0;
                        } // check tolerance
                    } // Reject no more than 30% of ratios
                } // nPts loop

                // another pass is needed only if this pass rejected at least one point
                reCalc = (nN < n0);
            } // canReject test
        } while (reCalc);

        if (canTukeys) {
            // March 2018 not finished as not sure where used
            // NOTE(review): every result computed in this branch is assigned to a local that is
            // never read afterwards
            System.arraycopy(values, 0, tbX, 0, nPts);
            double[] tukey = SquidMathUtils.tukeysBiweight(tbX, 6);
            biWtMean = tukey[0];
            biWtSigma = tukey[1];
            DescriptiveStatistics stats = new DescriptiveStatistics(tbX);
            double biWtErr95Median = stats.getPercentile(50);
            double median = median(tbX);
            double medianConf = medianConfLevel(nPts);
            double medianPlusErr = medianUpperLim(tbX) - median;
            double medianMinusErr = median - medianLowerLim(tbX);
        }

        // determine whether to return internal or external
        // a non-zero extMean is the signal that the external solution was found
        if (extMean != 0.0) {
            retVal = new double[][] {
                    { extMean, extSigma, extMeanErr68, extMeanErr95, MSWD, probability, 1.0 },
                    // contains zeroes for each reject
                    values };
        } else {
            retVal = new double[][] {
                    { intMean, intSigmaMean, intErr68, intMeanErr95, MSWD, probability, 0.0 },
                    // contains zeroes for each reject
                    values };
        }
    }
    return retVal;
}
From source file:org.commoncrawl.mapred.ec2.postprocess.deduper.DeduperUtils.java
/** * /* w w w .j a v a2 s. c om*/ * @param args */ public static void main(String[] args) throws IOException { URLFPBloomFilter filter = new URLFPBloomFilter(JSONSetBuilder.NUM_ELEMENTS, JSONSetBuilder.NUM_HASH_FUNCTIONS, JSONSetBuilder.NUM_BITS); DescriptiveStatistics filterClearStats = new DescriptiveStatistics(); for (int i = 0; i < 1000; ++i) { long timeStart = System.nanoTime(); filter.clear(); long timeEnd = System.nanoTime(); filterClearStats.addValue(timeEnd - timeStart); } System.out.println("Mean Clear Time:" + filterClearStats.getMean()); System.out.println("size:" + BINOMIAL_COFF); for (int j = 0; j < BINOMIAL_COFF; ++j) { int value = patternArray[j]; System.out.print("value:" + value + " "); for (int i = 5; i >= 0; --i) { System.out.print(((value & (1 << i)) != 0) ? '1' : '0'); } System.out.print(" Key MSBLen:" + Integer.toString(patternKeyMSBits[j]) + "\n"); } validateGenerator(); long key1 = new BitBuilder().on(10).off(1).on(53).bits(); long key2 = new BitBuilder().on(10).off(4).on(50).bits(); long key3 = new BitBuilder().on(10).off(4).on(47).off(3).bits(); long key4 = new BitBuilder().off(10).on(4).off(47).on(3).bits(); long key5 = new BitBuilder().off(10).on(4).off(47).on(1).off(2).bits(); Assert.assertTrue(SimHash.hammingDistance(key1, key2) == 3); Assert.assertTrue(SimHash.hammingDistance(key1, key3) != 3); Assert.assertTrue(SimHash.hammingDistance(key2, key3) == 3); Assert.assertTrue(SimHash.hammingDistance(key1, key4) > 3); Assert.assertTrue(SimHash.hammingDistance(key2, key4) > 3); Assert.assertTrue(SimHash.hammingDistance(key3, key4) > 3); Assert.assertTrue(SimHash.hammingDistance(key4, key5) <= 3); ImmutableList<DeduperValue> values = new ImmutableList.Builder<DeduperValue>() .add(new DeduperValue(key1, 1000, 2000, IPAddressUtils.IPV4AddressStrToInteger("10.0.0.1"), 1000, new TextBytes("http://adomain.com/"))) .add(new DeduperValue(key2, 1001, 2001, IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"), 1000, new 
TextBytes("http://bdomain.com/"))) .add(new DeduperValue(key3, 1002, 2002, IPAddressUtils.IPV4AddressStrToInteger("10.0.0.3"), 1000, new TextBytes("http://cdomain.com/"))) .add(new DeduperValue(key4, 1003, 2003, IPAddressUtils.IPV4AddressStrToInteger("10.0.0.4"), 1000, new TextBytes("http://ddomain.com/"))) .add(new DeduperValue(key5, 1004, 2004, IPAddressUtils.IPV4AddressStrToInteger("10.0.0.5"), 1000, new TextBytes("http://edomain.com/"))) .build(); SimhashMatcher unionFinder = new SimhashMatcher(); final Multimap<String, Long> rootDomainToDupes = TreeMultimap.create(); // collect all json set representations ... final ArrayList<TextBytes> jsonSets = new ArrayList<TextBytes>(); unionFinder.emitMatches(3, values.iterator(), new OutputCollector<TextBytes, TextBytes>() { @Override public void collect(TextBytes key, TextBytes value) throws IOException { System.out.println("Root:" + key + " JSON: " + value.toString()); populateTestJSONSetData(rootDomainToDupes, key, value); // collect all json sets for later disjoint-set join jsonSets.add(value); } }, null); ImmutableList<Long> hashSuperSet1 = ImmutableList.of(2000L, 2001L, 2002L); ImmutableList<Long> hashSuperSet2 = ImmutableList.of(2003L, 2004L); Assert.assertTrue(rootDomainToDupes.get("adomain.com").containsAll(hashSuperSet1)); Assert.assertTrue(rootDomainToDupes.get("bdomain.com").containsAll(hashSuperSet1)); Assert.assertTrue(rootDomainToDupes.get("cdomain.com").containsAll(hashSuperSet1)); Assert.assertTrue(rootDomainToDupes.get("ddomain.com").containsAll(hashSuperSet2)); Assert.assertTrue(rootDomainToDupes.get("edomain.com").containsAll(hashSuperSet2)); ImmutableList<DeduperValue> secondSetValues = new ImmutableList.Builder<DeduperValue>() .add(new DeduperValue(key1, 1000, 2000, IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"), 1000, new TextBytes("http://adomain.com/"))) .add(new DeduperValue(key1, 1007, 2007, IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"), 1000, new TextBytes("http://z1domain.com/"))) 
.add(new DeduperValue(key2, 1008, 2008, IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"), 1000, new TextBytes("http://z2domain.com/"))) .add(new DeduperValue(key3, 1009, 2009, IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"), 1000, new TextBytes("http://z3domain.com/"))) .build(); unionFinder.emitMatches(3, secondSetValues.iterator(), new OutputCollector<TextBytes, TextBytes>() { @Override public void collect(TextBytes key, TextBytes value) throws IOException { System.out.println("Root:" + key + " JSON: " + value.toString()); // collect all json sets for later disjoint-set join jsonSets.add(value); } }, null); SetUnionFinder unionFinder2 = new SetUnionFinder(); // union all json sets ... unionFinder2.union(jsonSets.iterator()); // ok emit union of sets ... unionFinder2.emit(new TextBytes("test"), new OutputCollector<TextBytes, TextBytes>() { @Override public void collect(TextBytes key, TextBytes value) throws IOException { System.out.println("Root:" + key + " JSON: " + value.toString()); } }, null); }
From source file:org.commoncrawl.mapred.pipelineV3.domainmeta.fuzzydedupe.CrossDomainDupesReducer.java
/**
 * Scores one host by how many of its duplicate hits live on other root domains and, when the
 * scores pass the thresholds, emits a JSON summary for that host.
 *
 * <p>Each incoming value is parsed as a JSON array of hit objects; the fields read from a hit are
 * {@code dh} (root domain hash), {@code length}, {@code url} and {@code ip}. The shared bloom
 * {@code filter} is cleared on entry and is used both to count distinct foreign root domains and to
 * count distinct IPs. Up to {@code samples.length} foreign URLs are kept as samples.
 *
 * <p>A record is only written when there are more than 15 hits, at least 2 cross-domain dupes, and
 * either score reaches the 0.50 threshold.
 *
 * @param key the host name being reduced
 * @param values JSON arrays of duplicate hits for this host
 * @param output receives at most one {@code (key, json)} record
 * @param reporter not used by this method
 * @throws IOException if the output collector fails
 */
@Override
public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {

    filter.clear();

    double crossDomainDupesCount = 0;
    double totalHitsCount = 0;
    double uniqueRootDomainsCount = 0;
    double uniqueIPs = 0;
    double validDupePatternMatches = 0;

    URLFPV2 rootFP = URLUtils.getURLFPV2FromHost(key.toString());
    // scratch fingerprint, reused for every bloom filter probe
    URLFPV2 fp = new URLFPV2();
    int sampleCount = 0;
    ArrayList<Integer> ipAddresses = new ArrayList<Integer>();
    JsonArray thisHostsDupes = new JsonArray();
    DescriptiveStatistics lengthStats = new DescriptiveStatistics();

    while (values.hasNext()) {
        JsonArray jsonArray = parser.parse(values.next().toString()).getAsJsonArray();
        for (JsonElement elem : jsonArray) {
            JsonObject hit = elem.getAsJsonObject();
            totalHitsCount++;
            fp.setRootDomainHash(hit.get("dh").getAsLong());
            if (fp.getRootDomainHash() != rootFP.getRootDomainHash()) {
                crossDomainDupesCount++;
                // key the filter entry on the root domain hash alone
                fp.setDomainHash(fp.getRootDomainHash());
                fp.setUrlHash(fp.getRootDomainHash());
                // track length average ....
                lengthStats.addValue(hit.get("length").getAsInt());
                if (!filter.isPresent(fp)) {
                    uniqueRootDomainsCount++;
                    filter.add(fp);
                    if (sampleCount < samples.length) {
                        String url = hit.get("url").getAsString();
                        GoogleURL urlObject = new GoogleURL(url);
                        if (knownValidDupesPatterns.matcher(urlObject.getCanonicalURL()).find()) {
                            validDupePatternMatches++;
                        }
                        samples[sampleCount++] = url;
                    }
                }
            } else {
                thisHostsDupes.add(elem);
            }

            // distinct IPs are tracked through the same filter, keyed on the IP value
            int ipAddress = hit.get("ip").getAsInt();
            fp.setRootDomainHash(ipAddress);
            fp.setDomainHash(ipAddress);
            fp.setUrlHash(ipAddress);
            if (!filter.isPresent(fp)) {
                uniqueIPs++;
                filter.add(fp);
                ipAddresses.add(ipAddress);
            }
        }
    }

    if (totalHitsCount > 15 && crossDomainDupesCount >= 2) {
        double otherDomainToLocalScore = otherDomainToLocalDomainScore(totalHitsCount, crossDomainDupesCount);
        double spamIPScore = spamHostScore(totalHitsCount, crossDomainDupesCount, uniqueIPs);
        if (otherDomainToLocalScore >= .50 || spamIPScore > .50) {
            JsonObject objectOut = new JsonObject();

            objectOut.addProperty("ratio", (crossDomainDupesCount / totalHitsCount));
            objectOut.addProperty("totalHits", totalHitsCount);
            objectOut.addProperty("crossDomainDupes", crossDomainDupesCount);
            objectOut.addProperty("uniqueRootDomains", uniqueRootDomainsCount);
            objectOut.addProperty("otherDomainToLocalScore", otherDomainToLocalScore);
            objectOut.addProperty("spamIPScore", spamIPScore);
            objectOut.addProperty("validDupeMatches", validDupePatternMatches);
            objectOut.addProperty("content-len-mean", lengthStats.getMean());
            objectOut.addProperty("content-len-geo-mean", lengthStats.getGeometricMean());
            for (int i = 0; i < sampleCount; ++i) {
                objectOut.addProperty("sample-" + i, samples[i]);
            }

            // compute path edit distance ...
            if (sampleCount > 1) {
                int sampleEditDistanceSize = Math.min(sampleCount, 5);

                // parse every sampled URL once instead of once per pair
                String[] paths = new String[sampleEditDistanceSize];
                for (int j = 0; j < sampleEditDistanceSize; ++j) {
                    paths[j] = new GoogleURL(samples[j]).getPath();
                }

                DescriptiveStatistics stats = new DescriptiveStatistics();
                for (int j = 0; j < sampleEditDistanceSize; ++j) {
                    for (int k = 0; k < sampleEditDistanceSize; ++k) {
                        // only pairs of short paths (< 100 chars) are compared
                        if (k != j && paths[j].length() < 100 && paths[k].length() < 100) {
                            stats.addValue(StringUtils.getLevenshteinDistance(paths[j], paths[k]));
                        }
                    }
                }
                // An empty sample has a NaN mean, and NaN != 0.0 is true, so without the getN()
                // check NaN would be written into the JSON when every pair was skipped.
                if (stats.getN() > 0 && stats.getMean() != 0.0) {
                    objectOut.addProperty("lev-distance-mean", stats.getMean());
                    objectOut.addProperty("lev-distance-geomean", stats.getGeometricMean());
                }
            }

            // the emitted IP list is capped at 1000 entries
            JsonArray ipAddressArray = new JsonArray();
            int ipLimit = Math.min(1000, ipAddresses.size());
            for (int j = 0; j < ipLimit; ++j) {
                ipAddressArray.add(new JsonPrimitive(ipAddresses.get(j)));
            }
            if (!ipAddresses.isEmpty()) {
                objectOut.add("ipList", ipAddressArray);
            }
            objectOut.add("thisHostDupes", thisHostsDupes);
            output.collect(key, new TextBytes(objectOut.toString()));
        }
    }
}
From source file:org.cse.visiri.app.algoevaluation.DistributionEval.java
public void EvaluateDistribution(QueryDistribution dist, String algoname) { Map<String, Integer> allInfo = new TreeMap<String, Integer>(); Map<String, Integer> nodeInfo = new TreeMap<String, Integer>(); Map<String, Double> nodeCosts = new TreeMap<String, Double>(); for (Query q : dist.getQueryAllocation().keySet()) { String node = dist.getQueryAllocation().get(q); if (!allInfo.containsKey(node)) { allInfo.put(node, 0);/*ww w . j av a 2 s.c om*/ } int val = allInfo.get(node); allInfo.put(node, val + 1); if (node.startsWith(NODE_PREFIX)) { if (!nodeInfo.containsKey(node)) { nodeInfo.put(node, 0); nodeCosts.put(node, 0.0); } val = nodeInfo.get(node); nodeInfo.put(node, val + 1); nodeCosts.put(node, nodeCosts.get(node) + q.getCost()); } } DescriptiveStatistics stat = new DescriptiveStatistics(); DescriptiveStatistics costStat = new DescriptiveStatistics(); System.out.println("Query counts : "); for (String node : nodeInfo.keySet()) { System.out.println(node + " : " + allInfo.get(node)); stat.addValue(nodeInfo.get(node)); costStat.addValue(nodeCosts.get(node)); } System.out.println(); double mean = stat.getMean(); double stdDev = Math.sqrt(stat.getPopulationVariance()); double varCoef = stdDev / mean; System.out.println("mean : " + mean); System.out.println("stdDev : " + stdDev); System.out.println("Coefficient of var : " + varCoef); System.out.println("\nCosts :"); mean = costStat.getMean(); stdDev = Math.sqrt(costStat.getPopulationVariance()); varCoef = stdDev / mean; System.out.println("mean : " + mean); System.out.println("stdDev : " + stdDev); System.out.println("Coefficient of var : " + varCoef); //calculate event duplication Map<String, Set<String>> eventMap = new TreeMap<String, Set<String>>(); for (Query q : dist.getQueryAllocation().keySet()) { String targetNode = dist.getQueryAllocation().get(q); for (StreamDefinition def : q.getInputStreamDefinitionsList()) { if (!eventMap.containsKey(def.getStreamId())) { eventMap.put(def.getStreamId(), new 
HashSet<String>()); } eventMap.get(def.getStreamId()).add(targetNode); } } stat = new DescriptiveStatistics(); for (Set<String> nodes : eventMap.values()) { stat.addValue(nodes.size()); } double avg = stat.getMean(); System.out.println(); System.out.println("Avg. event duplication " + avg); try { PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter("VISIRI_algoeval.txt", true))); out.println(stdDev); out.close(); } catch (IOException e) { e.printStackTrace(); } try { PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter("VISIRI_eventDup.txt", true))); out.println(avg); out.close(); } catch (IOException e) { e.printStackTrace(); } }
From source file:org.deidentifier.arx.aggregates.StatisticsBuilder.java
/** * Returns summary statistics for all attributes. * /*from www . j a v a 2 s .c om*/ * @param listwiseDeletion A flag enabling list-wise deletion * @return */ @SuppressWarnings({ "unchecked", "rawtypes" }) public <T> Map<String, StatisticsSummary<?>> getSummaryStatistics(boolean listwiseDeletion) { // Reset stop flag interrupt.value = false; Map<String, DescriptiveStatistics> statistics = new HashMap<String, DescriptiveStatistics>(); Map<String, StatisticsSummaryOrdinal> ordinal = new HashMap<String, StatisticsSummaryOrdinal>(); Map<String, DataScale> scales = new HashMap<String, DataScale>(); Map<String, GeometricMean> geomean = new HashMap<String, GeometricMean>(); // Detect scales for (int col = 0; col < handle.getNumColumns(); col++) { // Meta String attribute = handle.getAttributeName(col); DataType<?> type = handle.getDataType(attribute); // Scale DataScale scale = type.getDescription().getScale(); // Try to replace nominal scale with ordinal scale based on base data type if (scale == DataScale.NOMINAL && handle.getGeneralization(attribute) != 0) { if (!(handle.getBaseDataType(attribute) instanceof ARXString) && getHierarchy(col, true) != null) { scale = DataScale.ORDINAL; } } // Store scales.put(attribute, scale); statistics.put(attribute, new DescriptiveStatistics()); geomean.put(attribute, new GeometricMean()); ordinal.put(attribute, getSummaryStatisticsOrdinal(handle.getGeneralization(attribute), handle.getDataType(attribute), handle.getBaseDataType(attribute), getHierarchy(col, true))); } // Compute summary statistics for (int row = 0; row < handle.getNumRows(); row++) { // Check, if we should include this row boolean include = true; if (listwiseDeletion) { for (int col = 0; col < handle.getNumColumns(); col++) { if (handle.isOutlier(row) || DataType.isNull(handle.getValue(row, col))) { include = false; break; } } } // Check checkInterrupt(); // If yes, add if (include) { // For each column for (int col = 0; col < handle.getNumColumns(); col++) { // 
Meta String value = handle.getValue(row, col); String attribute = handle.getAttributeName(col); DataType<?> type = handle.getDataType(attribute); // Analyze if (!DataType.isAny(value) && !DataType.isNull(value)) { ordinal.get(attribute).addValue(value); if (type instanceof DataTypeWithRatioScale) { double doubleValue = ((DataTypeWithRatioScale) type).toDouble(type.parse(value)); statistics.get(attribute).addValue(doubleValue); geomean.get(attribute).increment(doubleValue + 1d); } } } } } // Convert Map<String, StatisticsSummary<?>> result = new HashMap<String, StatisticsSummary<?>>(); for (int col = 0; col < handle.getNumColumns(); col++) { // Check checkInterrupt(); // Depending on scale String attribute = handle.getAttributeName(col); DataScale scale = scales.get(attribute); DataType<T> type = (DataType<T>) handle.getDataType(attribute); ordinal.get(attribute).analyze(); if (scale == DataScale.NOMINAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); result.put(attribute, new StatisticsSummary<T>(DataScale.NOMINAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()))); } else if (scale == DataScale.ORDINAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); result.put(attribute, new StatisticsSummary<T>(DataScale.ORDINAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()))); } else if (scale == DataScale.INTERVAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); DescriptiveStatistics stats2 = statistics.get(attribute); boolean isPeriod = type.getDescription().getWrappedClass() == Date.class; // TODO: Something is wrong with commons math's kurtosis double kurtosis = stats2.getKurtosis(); kurtosis = kurtosis < 0d ? 
Double.NaN : kurtosis; double range = stats2.getMax() - stats2.getMin(); double stddev = Math.sqrt(stats2.getVariance()); result.put(attribute, new StatisticsSummary<T>(DataScale.INTERVAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()), toString(type, stats2.getMean(), false, false), toValue(type, stats2.getMean()), stats2.getMean(), toString(type, stats2.getVariance(), isPeriod, true), toValue(type, stats2.getVariance()), stats2.getVariance(), toString(type, stats2.getPopulationVariance(), isPeriod, true), toValue(type, stats2.getPopulationVariance()), stats2.getPopulationVariance(), toString(type, stddev, isPeriod, false), toValue(type, stddev), stddev, toString(type, range, isPeriod, false), toValue(type, range), stats2.getMax() - stats2.getMin(), toString(type, kurtosis, isPeriod, false), toValue(type, kurtosis), kurtosis)); } else if (scale == DataScale.RATIO) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); DescriptiveStatistics stats2 = statistics.get(attribute); GeometricMean geo = geomean.get(attribute); // TODO: Something is wrong with commons math's kurtosis double kurtosis = stats2.getKurtosis(); kurtosis = kurtosis < 0d ? 
Double.NaN : kurtosis; double range = stats2.getMax() - stats2.getMin(); double stddev = Math.sqrt(stats2.getVariance()); result.put(attribute, new StatisticsSummary<T>(DataScale.RATIO, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()), toString(type, stats2.getMean(), false, false), toValue(type, stats2.getMean()), stats2.getMean(), toString(type, stats2.getVariance(), false, false), toValue(type, stats2.getVariance()), stats2.getVariance(), toString(type, stats2.getPopulationVariance(), false, false), toValue(type, stats2.getPopulationVariance()), stats2.getPopulationVariance(), toString(type, stddev, false, false), toValue(type, stddev), stddev, toString(type, range, false, false), toValue(type, range), range, toString(type, kurtosis, false, false), toValue(type, kurtosis), kurtosis, toString(type, geo.getResult() - 1d, false, false), toValue(type, geo.getResult() - 1d), stats2.getGeometricMean())); } } return result; }
From source file:org.dllearner.algorithms.qtl.experiments.BenchmarkDescriptionGeneratorHTML.java
@Override protected void addRow(QueryData queryData) { sb.append("<tr>\n"); // column: ID sb.append("<td>" + queryData.id + "</td>\n"); // column: SPARQL query sb.append("<td><pre>" + queryData.query.toString().replace("<", "<").replace(">", ">") + "</pre></td>\n"); // column: SPARQL query type sb.append("<td>" + queryData.queryType + "</td>\n"); // query graph // QueryToGraphExporter.exportYedGraph(queryData.query, new File("")); // sb.append("<td><img src=\"" + graphFile.getPath() + "\" alt=\"query graph\"></td>\n"); // column: depth sb.append("<td class='number'>" + queryData.maxTreeDepth + "</td>\n"); // column: #instances sb.append("<td class='number'>" + queryData.nrOfInstances + "</td>\n"); // columns: optimal CBD sizes (min, max, avg) DescriptiveStatistics optimalCBDSizeStats = queryData.optimalCBDSizeStats; sb.append("<td class='number'>" + (int) optimalCBDSizeStats.getMin() + "</td>\n"); sb.append("<td class='number'>" + (int) optimalCBDSizeStats.getMax() + "</td>\n"); sb.append("<td class='number'>" + (int) optimalCBDSizeStats.getMean() + "</td>\n"); // columns: generic CBD sizes (min, max, avg) DescriptiveStatistics genericCBDSizeStats = queryData.defaultCBDSizesStats; sb.append("<td class='number'>" + (int) genericCBDSizeStats.getMin() + "</td>\n"); sb.append("<td class='number'>" + (int) genericCBDSizeStats.getMax() + "</td>\n"); sb.append("<td class='number'>" + (int) genericCBDSizeStats.getMean() + "</td>\n"); sb.append("</tr>\n"); }
From source file:org.dllearner.algorithms.qtl.experiments.EvaluationDataset.java
public void analyze() { ConciseBoundedDescriptionGenerator cbdGen = new SymmetricConciseBoundedDescriptionGeneratorImpl( ks.getQueryExecutionFactory()); String separator = "\t"; String tsv = sparqlQueries.entrySet().stream().map(entry -> { StringBuilder sb = new StringBuilder(); // ID//from w w w. j a v a 2 s .c o m String id = entry.getKey(); sb.append(id).append(separator); // query Query q = entry.getValue(); sb.append(q.toString().replace("\n", " ")); try { // get query result List<String> result = SPARQLUtils.getResult(ks.getQueryExecutionFactory(), q); sb.append(separator).append(result.size()); // query type SPARQLUtils.QueryType queryType = SPARQLUtils.getQueryType(q); sb.append(separator).append(queryType.name()); // check CBD sizes and time Monitor mon = MonitorFactory.getTimeMonitor("CBD"); mon.reset(); DescriptiveStatistics sizeStats = new DescriptiveStatistics(); result.stream().map(r -> { System.out.println(r); mon.start(); Model cbd = cbdGen.getConciseBoundedDescription(r, 2); mon.stop(); return cbd; }).map(Model::size).forEach(sizeStats::addValue); // show min., max. and avg. size sb.append(separator).append(sizeStats.getMin()); sb.append(separator).append(sizeStats.getMax()); sb.append(separator).append(sizeStats.getMean()); // show min., max. and avg. CBD time sb.append(separator).append(mon.getTotal()); sb.append(separator).append(mon.getMin()); sb.append(separator).append(mon.getMax()); sb.append(separator).append(mon.getAvg()); } catch (Exception e) { e.printStackTrace(); } return sb; }).collect(Collectors.joining("\n")); System.out.println(tsv); }
From source file:org.dllearner.algorithms.qtl.experiments.PRConvergenceExperiment.java
public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInterval, double[] noiseInterval, HeuristicType[] measures) throws Exception { this.maxTreeDepth = maxTreeDepth; queryTreeFactory.setMaxDepth(maxTreeDepth); if (exampleInterval != null) { nrOfExamplesIntervals = exampleInterval; }/* w ww. ja v a2 s. c om*/ if (noiseInterval != null) { this.noiseIntervals = noiseInterval; } if (measures != null) { this.measures = measures; } boolean noiseEnabled = noiseIntervals.length > 1 || noiseInterval[0] > 0; boolean posOnly = noiseEnabled ? false : true; logger.info("Started QTL evaluation..."); long t1 = System.currentTimeMillis(); List<String> queries = dataset.getSparqlQueries().values().stream().map(q -> q.toString()) .collect(Collectors.toList()); logger.info("#loaded queries: " + queries.size()); // filter for debugging purposes queries = queries.stream().filter(q -> queriesToProcessTokens.stream().noneMatch(t -> !q.contains(t))) .collect(Collectors.toList()); queries = queries.stream().filter(q -> queriesToOmitTokens.stream().noneMatch(t -> q.contains(t))) .collect(Collectors.toList()); if (maxNrOfProcessedQueries == -1) { maxNrOfProcessedQueries = queries.size(); } // queries = filter(queries, (int) Math.ceil((double) maxNrOfProcessedQueries / maxTreeDepth)); // queries = queries.subList(0, Math.min(queries.size(), maxNrOfProcessedQueries)); logger.info("#queries to process: " + queries.size()); // generate examples for each query logger.info("precomputing pos. and neg. examples..."); for (String query : queries) {//if(!(query.contains("Borough_(New_York_City)")))continue; query2Examples.put(query, generateExamples(query, posOnly, noiseEnabled)); } logger.info("precomputing pos. and neg. 
examples finished."); // check for queries that do not return any result (should not happen, but we never know) Set<String> emptyQueries = query2Examples.entrySet().stream() .filter(e -> e.getValue().correctPosExampleCandidates.isEmpty()).map(e -> e.getKey()) .collect(Collectors.toSet()); logger.info("got {} empty queries.", emptyQueries.size()); queries.removeAll(emptyQueries); // min. pos examples int min = 3; Set<String> lowNrOfExamplesQueries = query2Examples.entrySet().stream() .filter(e -> e.getValue().correctPosExampleCandidates.size() < min).map(e -> e.getKey()) .collect(Collectors.toSet()); logger.info("got {} queries with < {} pos. examples.", emptyQueries.size(), min); queries.removeAll(lowNrOfExamplesQueries); queries = queries.subList(0, Math.min(80, queries.size())); final int totalNrOfQTLRuns = heuristics.length * this.measures.length * nrOfExamplesIntervals.length * noiseIntervals.length * queries.size(); logger.info("#QTL runs: " + totalNrOfQTLRuns); final AtomicInteger currentNrOfFinishedRuns = new AtomicInteger(0); // loop over heuristics for (final QueryTreeHeuristic heuristic : heuristics) { final String heuristicName = heuristic.getClass().getAnnotation(ComponentAnn.class).shortName(); // loop over heuristics measures for (HeuristicType measure : this.measures) { final String measureName = measure.toString(); heuristic.setHeuristicType(measure); double[][] data = new double[nrOfExamplesIntervals.length][noiseIntervals.length]; // loop over number of positive examples for (int i = 0; i < nrOfExamplesIntervals.length; i++) { final int nrOfExamples = nrOfExamplesIntervals[i]; // loop over noise value for (int j = 0; j < noiseIntervals.length; j++) { final double noise = noiseIntervals[j]; // check if not already processed File logFile = new File(benchmarkDirectory, "qtl2-" + nrOfExamples + "-" + noise + "-" + heuristicName + "-" + measureName + ".log"); File statsFile = new File(benchmarkDirectory, "qtl2-" + nrOfExamples + "-" + noise + "-" + 
heuristicName + "-" + measureName + ".stats"); if (!override && logFile.exists() && statsFile.exists()) { logger.info( "Eval config already processed. For re-running please remove corresponding output files."); continue; } FileAppender appender = null; try { appender = new FileAppender(new SimpleLayout(), logFile.getPath(), false); Logger.getRootLogger().addAppender(appender); } catch (IOException e) { e.printStackTrace(); } logger.info("#examples: " + nrOfExamples + " noise: " + noise); final DescriptiveStatistics nrOfReturnedSolutionsStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics baselinePrecisionStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics baselineRecallStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics baselineFMeasureStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics baselinePredAccStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics baselineMathCorrStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionPrecisionStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionRecallStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionFMeasureStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionPredAccStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionMathCorrStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionRuntimeStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionPrecisionStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionRecallStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionFMeasureStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics 
bestSolutionPredAccStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionMathCorrStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionPositionStats = new SynchronizedDescriptiveStatistics(); MonitorFactory.getTimeMonitor(TimeMonitors.CBD_RETRIEVAL.name()).reset(); MonitorFactory.getTimeMonitor(TimeMonitors.TREE_GENERATION.name()).reset(); ExecutorService tp = Executors.newFixedThreadPool(nrOfThreads); // indicates if the execution for some of the queries failed final AtomicBoolean failed = new AtomicBoolean(false); Set<String> queriesToProcess = new TreeSet<>(queries); queriesToProcess.retainAll(query2Examples.entrySet().stream() .filter(e -> e.getValue().correctPosExampleCandidates.size() >= nrOfExamples) .map(e -> e.getKey()).collect(Collectors.toSet())); // loop over SPARQL queries for (final String sparqlQuery : queriesToProcess) { CBDStructureTree cbdStructure = cbdStructureTree != null ? cbdStructureTree : QueryUtils.getOptimalCBDStructure(QueryFactory.create(sparqlQuery)); tp.submit(() -> { logger.info("CBD tree:" + cbdStructure.toStringVerbose()); // update max tree depth this.maxTreeDepth = QueryTreeUtils.getDepth(cbdStructure); logger.info("##############################################################"); logger.info("Processing query\n" + sparqlQuery); // we repeat it n times with different permutations of examples int nrOfPermutations = 1; if (nrOfExamples >= query2Examples.get(sparqlQuery).correctPosExampleCandidates .size()) { nrOfPermutations = 1; } for (int perm = 1; perm <= nrOfPermutations; perm++) { logger.info("Run {}/{}", perm, nrOfPermutations); try { ExamplesWrapper examples = getExamples(sparqlQuery, nrOfExamples, nrOfExamples, noise, cbdStructure); logger.info("pos. examples:\n" + Joiner.on("\n").join(examples.correctPosExamples)); logger.info("neg. 
examples:\n" + Joiner.on("\n").join(examples.correctNegExamples)); // write examples to disk File dir = new File(benchmarkDirectory, "data/" + hash(sparqlQuery)); dir.mkdirs(); Files.write(Joiner.on("\n").join(examples.correctPosExamples), new File(dir, "examples" + perm + "_" + nrOfExamples + "_" + noise + ".tp"), Charsets.UTF_8); Files.write(Joiner.on("\n").join(examples.correctNegExamples), new File(dir, "examples" + perm + "_" + nrOfExamples + "_" + noise + ".tn"), Charsets.UTF_8); Files.write(Joiner.on("\n").join(examples.falsePosExamples), new File(dir, "examples" + perm + "_" + nrOfExamples + "_" + noise + ".fp"), Charsets.UTF_8); // compute baseline RDFResourceTree baselineSolution = applyBaseLine(examples, Baseline.MOST_INFORMATIVE_EDGE_IN_EXAMPLES); logger.info("Evaluating baseline..."); Score baselineScore = computeScore(sparqlQuery, baselineSolution, noise); logger.info("Baseline score:\n" + baselineScore); String baseLineQuery = QueryTreeUtils.toSPARQLQueryString(baselineSolution, dataset.getBaseIRI(), dataset.getPrefixMapping()); baselinePrecisionStats.addValue(baselineScore.precision); baselineRecallStats.addValue(baselineScore.recall); baselineFMeasureStats.addValue(baselineScore.fmeasure); baselinePredAccStats.addValue(baselineScore.predAcc); baselineMathCorrStats.addValue(baselineScore.mathCorr); // run QTL PosNegLPStandard lp = new PosNegLPStandard(); lp.setPositiveExamples(examples.posExamplesMapping.keySet()); lp.setNegativeExamples(examples.negExamplesMapping.keySet()); // QTL2Disjunctive la = new QTL2Disjunctive(lp, qef); QTL2DisjunctiveMultiThreaded la = new QTL2DisjunctiveMultiThreaded(lp, qef); la.setRenderer(new org.dllearner.utilities.owl.DLSyntaxObjectRenderer()); la.setReasoner(dataset.getReasoner()); la.setEntailment(Entailment.SIMPLE); la.setTreeFactory(queryTreeFactory); la.setPositiveExampleTrees(examples.posExamplesMapping); la.setNegativeExampleTrees(examples.negExamplesMapping); la.setNoise(noise); la.setHeuristic(heuristic); 
la.setMaxExecutionTimeInSeconds(maxExecutionTimeInSeconds); la.setMaxTreeComputationTimeInSeconds(maxExecutionTimeInSeconds); la.init(); la.start(); List<EvaluatedRDFResourceTree> solutions = new ArrayList<>( la.getSolutions()); // List<EvaluatedRDFResourceTree> solutions = generateSolutions(examples, noise, heuristic); nrOfReturnedSolutionsStats.addValue(solutions.size()); // the best returned solution by QTL EvaluatedRDFResourceTree bestSolution = solutions.get(0); logger.info("Got " + solutions.size() + " query trees."); // logger.info("Best computed solution:\n" + render(bestSolution.asEvaluatedDescription())); logger.info("QTL Score:\n" + bestSolution.getTreeScore()); long runtimeBestSolution = la.getTimeBestSolutionFound(); bestReturnedSolutionRuntimeStats.addValue(runtimeBestSolution); // convert to SPARQL query RDFResourceTree tree = bestSolution.getTree(); tree = filter.apply(tree); String learnedSPARQLQuery = QueryTreeUtils.toSPARQLQueryString(tree, dataset.getBaseIRI(), dataset.getPrefixMapping()); // compute score Score score = computeScore(sparqlQuery, tree, noise); bestReturnedSolutionPrecisionStats.addValue(score.precision); bestReturnedSolutionRecallStats.addValue(score.recall); bestReturnedSolutionFMeasureStats.addValue(score.fmeasure); bestReturnedSolutionPredAccStats.addValue(score.predAcc); bestReturnedSolutionMathCorrStats.addValue(score.mathCorr); logger.info(score.toString()); // find the extensionally best matching tree in the list Pair<EvaluatedRDFResourceTree, Score> bestMatchingTreeWithScore = findBestMatchingTreeFast( solutions, sparqlQuery, noise, examples); EvaluatedRDFResourceTree bestMatchingTree = bestMatchingTreeWithScore .getFirst(); Score bestMatchingScore = bestMatchingTreeWithScore.getSecond(); // position of best tree in list of solutions int positionBestScore = solutions.indexOf(bestMatchingTree); bestSolutionPositionStats.addValue(positionBestScore); Score bestScore = score; if (positionBestScore > 0) { logger.info( 
"Position of best covering tree in list: " + positionBestScore); logger.info("Best covering solution:\n" + render(bestMatchingTree.asEvaluatedDescription())); logger.info("Tree score: " + bestMatchingTree.getTreeScore()); bestScore = bestMatchingScore; logger.info(bestMatchingScore.toString()); } else { logger.info( "Best returned solution was also the best covering solution."); } bestSolutionRecallStats.addValue(bestScore.recall); bestSolutionPrecisionStats.addValue(bestScore.precision); bestSolutionFMeasureStats.addValue(bestScore.fmeasure); bestSolutionPredAccStats.addValue(bestScore.predAcc); bestSolutionMathCorrStats.addValue(bestScore.mathCorr); for (RDFResourceTree negTree : examples.negExamplesMapping.values()) { if (QueryTreeUtils.isSubsumedBy(negTree, bestMatchingTree.getTree())) { Files.append(sparqlQuery + "\n", new File("/tmp/negCovered.txt"), Charsets.UTF_8); break; } } String bestQuery = QueryFactory .create(QueryTreeUtils.toSPARQLQueryString( filter.apply(bestMatchingTree.getTree()), dataset.getBaseIRI(), dataset.getPrefixMapping())) .toString(); if (write2DB) { write2DB(sparqlQuery, nrOfExamples, examples, noise, baseLineQuery, baselineScore, heuristicName, measureName, QueryFactory.create(learnedSPARQLQuery).toString(), score, runtimeBestSolution, bestQuery, positionBestScore, bestScore); } } catch (Exception e) { failed.set(true); logger.error("Error occured for query\n" + sparqlQuery, e); try { StringWriter sw = new StringWriter(); PrintWriter pw = new PrintWriter(sw); e.printStackTrace(pw); Files.append(sparqlQuery + "\n" + sw.toString(), new File(benchmarkDirectory, "failed-" + nrOfExamples + "-" + noise + "-" + heuristicName + "-" + measureName + ".txt"), Charsets.UTF_8); } catch (IOException e1) { e1.printStackTrace(); } } finally { int cnt = currentNrOfFinishedRuns.incrementAndGet(); logger.info("***********Evaluation Progress:" + NumberFormat.getPercentInstance() .format((double) cnt / totalNrOfQTLRuns) + "(" + cnt + "/" + totalNrOfQTLRuns 
+ ")" + "***********"); } } }); } tp.shutdown(); tp.awaitTermination(12, TimeUnit.HOURS); Logger.getRootLogger().removeAppender(appender); if (!failed.get()) { String result = ""; result += "\nBaseline Precision:\n" + baselinePrecisionStats; result += "\nBaseline Recall:\n" + baselineRecallStats; result += "\nBaseline F-measure:\n" + baselineFMeasureStats; result += "\nBaseline PredAcc:\n" + baselinePredAccStats; result += "\nBaseline MathCorr:\n" + baselineMathCorrStats; result += "#Returned solutions:\n" + nrOfReturnedSolutionsStats; result += "\nOverall Precision:\n" + bestReturnedSolutionPrecisionStats; result += "\nOverall Recall:\n" + bestReturnedSolutionRecallStats; result += "\nOverall F-measure:\n" + bestReturnedSolutionFMeasureStats; result += "\nOverall PredAcc:\n" + bestReturnedSolutionPredAccStats; result += "\nOverall MathCorr:\n" + bestReturnedSolutionMathCorrStats; result += "\nTime until best returned solution found:\n" + bestReturnedSolutionRuntimeStats; result += "\nPositions of best solution:\n" + Arrays.toString(bestSolutionPositionStats.getValues()); result += "\nPosition of best solution stats:\n" + bestSolutionPositionStats; result += "\nOverall Precision of best solution:\n" + bestSolutionPrecisionStats; result += "\nOverall Recall of best solution:\n" + bestSolutionRecallStats; result += "\nOverall F-measure of best solution:\n" + bestSolutionFMeasureStats; result += "\nCBD generation time(total):\t" + MonitorFactory.getTimeMonitor(TimeMonitors.CBD_RETRIEVAL.name()).getTotal() + "\n"; result += "CBD generation time(avg):\t" + MonitorFactory.getTimeMonitor(TimeMonitors.CBD_RETRIEVAL.name()).getAvg() + "\n"; result += "Tree generation time(total):\t" + MonitorFactory.getTimeMonitor(TimeMonitors.TREE_GENERATION.name()).getTotal() + "\n"; result += "Tree generation time(avg):\t" + MonitorFactory.getTimeMonitor(TimeMonitors.TREE_GENERATION.name()).getAvg() + "\n"; result += "Tree size(avg):\t" + treeSizeStats.getMean() + "\n"; 
logger.info(result); try { Files.write(result, statsFile, Charsets.UTF_8); } catch (IOException e) { e.printStackTrace(); } data[i][j] = bestReturnedSolutionFMeasureStats.getMean(); if (write2DB) { write2DB(heuristicName, measureName, nrOfExamples, noise, bestReturnedSolutionFMeasureStats.getMean(), bestReturnedSolutionPrecisionStats.getMean(), bestReturnedSolutionRecallStats.getMean(), bestReturnedSolutionPredAccStats.getMean(), bestReturnedSolutionMathCorrStats.getMean(), bestSolutionPositionStats.getMean(), bestSolutionFMeasureStats.getMean(), bestSolutionPrecisionStats.getMean(), bestSolutionRecallStats.getMean(), bestSolutionPredAccStats.getMean(), bestSolutionMathCorrStats.getMean(), baselineFMeasureStats.getMean(), baselinePrecisionStats.getMean(), baselineRecallStats.getMean(), baselinePredAccStats.getMean(), baselineMathCorrStats.getMean(), bestReturnedSolutionRuntimeStats.getMean()); } } } } String content = "###"; String separator = "\t"; for (double noiseInterval1 : noiseIntervals) { content += separator + noiseInterval1; } content += "\n"; for (int i = 0; i < nrOfExamplesIntervals.length; i++) { content += nrOfExamplesIntervals[i]; for (int j = 0; j < noiseIntervals.length; j++) { content += separator + data[i][j]; } content += "\n"; } File examplesVsNoise = new File(benchmarkDirectory, "examplesVsNoise-" + heuristicName + "-" + measureName + ".tsv"); try { Files.write(content, examplesVsNoise, Charsets.UTF_8); } catch (IOException e) { logger.error("failed to write stats to file", e); } } } if (write2DB) { conn.close(); } if (useEmailNotification) { sendFinishedMail(); } long t2 = System.currentTimeMillis(); long duration = t2 - t1; logger.info("QTL evaluation finished in " + DurationFormatUtils.formatDurationHMS(duration) + "ms."); }
From source file:org.dllearner.algorithms.qtl.experiments.QTLEvaluation.java
public void run(int maxNrOfProcessedQueries, int maxTreeDepth, int[] exampleInterval, double[] noiseInterval, HeuristicType[] measures) throws Exception { this.maxTreeDepth = maxTreeDepth; queryTreeFactory.setMaxDepth(maxTreeDepth); if (exampleInterval != null) { nrOfExamplesIntervals = exampleInterval; }//from w ww. j ava2 s . c o m if (noiseInterval != null) { this.noiseIntervals = noiseInterval; } if (measures != null) { this.measures = measures; } logger.info("Started QTL evaluation..."); long t1 = System.currentTimeMillis(); List<String> queries = dataset.getSparqlQueries().values().stream().map(q -> q.toString()) .collect(Collectors.toList()); logger.info("#loaded queries: " + queries.size()); // filter for debugging purposes queries = queries.stream().filter(q -> tokens.stream().noneMatch(t -> !q.contains(t))) .collect(Collectors.toList()); if (maxNrOfProcessedQueries == -1) { maxNrOfProcessedQueries = queries.size(); } // queries = filter(queries, (int) Math.ceil((double) maxNrOfProcessedQueries / maxTreeDepth)); // queries = queries.subList(0, Math.min(queries.size(), maxNrOfProcessedQueries)); logger.info("#queries to process: " + queries.size()); // generate examples for each query logger.info("precomputing pos. and neg. examples..."); final Map<String, ExampleCandidates> query2Examples = new HashMap<>(); for (String query : queries) {//if(!(query.contains("Borough_(New_York_City)")))continue; query2Examples.put(query, generateExamples(query)); } logger.info("precomputing pos. and neg. examples finished."); // check for queries that do not return any result (should not happen, but we never know) Set<String> emptyQueries = query2Examples.entrySet().stream() .filter(e -> e.getValue().correctPosExampleCandidates.isEmpty()).map(e -> e.getKey()) .collect(Collectors.toSet()); logger.info("got {} empty queries.", emptyQueries.size()); queries.removeAll(emptyQueries); // min. 
pos examples Set<String> lowNrOfExamplesQueries = query2Examples.entrySet().stream() .filter(e -> e.getValue().correctPosExampleCandidates.size() < 2).map(e -> e.getKey()) .collect(Collectors.toSet()); logger.info("got {} queries with < 2 pos. examples.", emptyQueries.size()); queries.removeAll(lowNrOfExamplesQueries); final int totalNrOfQTLRuns = heuristics.length * this.measures.length * nrOfExamplesIntervals.length * noiseIntervals.length * queries.size(); logger.info("#QTL runs: " + totalNrOfQTLRuns); final AtomicInteger currentNrOfFinishedRuns = new AtomicInteger(0); // loop over heuristics for (final QueryTreeHeuristic heuristic : heuristics) { final String heuristicName = heuristic.getClass().getAnnotation(ComponentAnn.class).shortName(); // loop over heuristics measures for (HeuristicType measure : this.measures) { final String measureName = measure.toString(); heuristic.setHeuristicType(measure); double[][] data = new double[nrOfExamplesIntervals.length][noiseIntervals.length]; // loop over number of positive examples for (int i = 0; i < nrOfExamplesIntervals.length; i++) { final int nrOfExamples = nrOfExamplesIntervals[i]; // loop over noise value for (int j = 0; j < noiseIntervals.length; j++) { final double noise = noiseIntervals[j]; // check if not already processed File logFile = new File(benchmarkDirectory, "qtl2-" + nrOfExamples + "-" + noise + "-" + heuristicName + "-" + measureName + ".log"); File statsFile = new File(benchmarkDirectory, "qtl2-" + nrOfExamples + "-" + noise + "-" + heuristicName + "-" + measureName + ".stats"); if (!override && logFile.exists() && statsFile.exists()) { logger.info( "Eval config already processed. 
For re-running please remove corresponding output files."); continue; } FileAppender appender = null; try { appender = new FileAppender(new SimpleLayout(), logFile.getPath(), false); Logger.getRootLogger().addAppender(appender); } catch (IOException e) { e.printStackTrace(); } logger.info("#examples: " + nrOfExamples + " noise: " + noise); final DescriptiveStatistics nrOfReturnedSolutionsStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics baselinePrecisionStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics baselineRecallStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics baselineFMeasureStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics baselinePredAccStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics baselineMathCorrStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionPrecisionStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionRecallStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionFMeasureStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionPredAccStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionMathCorrStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestReturnedSolutionRuntimeStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionPrecisionStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionRecallStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionFMeasureStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionPredAccStats = new SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionMathCorrStats = new 
SynchronizedDescriptiveStatistics(); final DescriptiveStatistics bestSolutionPositionStats = new SynchronizedDescriptiveStatistics(); MonitorFactory.getTimeMonitor(TimeMonitors.CBD_RETRIEVAL.name()).reset(); MonitorFactory.getTimeMonitor(TimeMonitors.TREE_GENERATION.name()).reset(); ExecutorService tp = Executors.newFixedThreadPool(nrOfThreads); // indicates if the execution for some of the queries failed final AtomicBoolean failed = new AtomicBoolean(false); // loop over SPARQL queries for (final String sparqlQuery : queries) { tp.submit(() -> { logger.info("##############################################################"); logger.info("Processing query\n" + sparqlQuery); try { ExamplesWrapper examples = query2Examples.get(sparqlQuery).get(nrOfExamples, nrOfExamples, noise); logger.info( "pos. examples:\n" + Joiner.on("\n").join(examples.correctPosExamples)); logger.info( "neg. examples:\n" + Joiner.on("\n").join(examples.correctNegExamples)); // write examples to disk File dir = new File(benchmarkDirectory, "data/" + hash(sparqlQuery)); dir.mkdirs(); Files.write(Joiner.on("\n").join(examples.correctPosExamples), new File(dir, "examples_" + nrOfExamples + "_" + noise + ".tp"), Charsets.UTF_8); Files.write(Joiner.on("\n").join(examples.correctNegExamples), new File(dir, "examples_" + nrOfExamples + "_" + noise + ".tn"), Charsets.UTF_8); Files.write(Joiner.on("\n").join(examples.falsePosExamples), new File(dir, "examples_" + nrOfExamples + "_" + noise + ".fp"), Charsets.UTF_8); // compute baseline logger.info("Computing baseline..."); RDFResourceTree baselineSolution = applyBaseLine(examples, Baseline.MOST_INFORMATIVE_EDGE_IN_EXAMPLES); logger.info("Baseline solution:\n" + owlRenderer .render(QueryTreeUtils.toOWLClassExpression(baselineSolution))); logger.info("Evaluating baseline..."); Score baselineScore = computeScore(sparqlQuery, baselineSolution, noise); logger.info("Baseline score:\n" + baselineScore); String baseLineQuery = 
QueryTreeUtils.toSPARQLQueryString(baselineSolution, dataset.getBaseIRI(), dataset.getPrefixMapping()); baselinePrecisionStats.addValue(baselineScore.precision); baselineRecallStats.addValue(baselineScore.recall); baselineFMeasureStats.addValue(baselineScore.fmeasure); baselinePredAccStats.addValue(baselineScore.predAcc); baselineMathCorrStats.addValue(baselineScore.mathCorr); // run QTL PosNegLPStandard lp = new PosNegLPStandard(); lp.setPositiveExamples(examples.posExamplesMapping.keySet()); lp.setNegativeExamples(examples.negExamplesMapping.keySet()); QTL2Disjunctive la = new QTL2Disjunctive(lp, qef); la.setRenderer(new org.dllearner.utilities.owl.DLSyntaxObjectRenderer()); la.setReasoner(dataset.getReasoner()); la.setEntailment(Entailment.SIMPLE); la.setTreeFactory(queryTreeFactory); la.setPositiveExampleTrees(examples.posExamplesMapping); la.setNegativeExampleTrees(examples.negExamplesMapping); la.setNoise(noise); la.setHeuristic(heuristic); la.setMaxExecutionTimeInSeconds(maxExecutionTimeInSeconds); la.setMaxTreeComputationTimeInSeconds(maxExecutionTimeInSeconds); la.init(); la.start(); List<EvaluatedRDFResourceTree> solutions = new ArrayList<>(la.getSolutions()); // List<EvaluatedRDFResourceTree> solutions = generateSolutions(examples, noise, heuristic); nrOfReturnedSolutionsStats.addValue(solutions.size()); // the best returned solution by QTL EvaluatedRDFResourceTree bestSolution = solutions.get(0); logger.info("Got " + solutions.size() + " query trees."); logger.info("Best computed solution:\n" + render(bestSolution.asEvaluatedDescription())); logger.info("QTL Score:\n" + bestSolution.getTreeScore()); long runtimeBestSolution = la.getTimeBestSolutionFound(); bestReturnedSolutionRuntimeStats.addValue(runtimeBestSolution); // convert to SPARQL query RDFResourceTree tree = bestSolution.getTree(); // filter.filter(tree); String learnedSPARQLQuery = QueryTreeUtils.toSPARQLQueryString(tree, dataset.getBaseIRI(), dataset.getPrefixMapping()); // compute score 
Score score = computeScore(sparqlQuery, tree, noise); bestReturnedSolutionPrecisionStats.addValue(score.precision); bestReturnedSolutionRecallStats.addValue(score.recall); bestReturnedSolutionFMeasureStats.addValue(score.fmeasure); bestReturnedSolutionPredAccStats.addValue(score.predAcc); bestReturnedSolutionMathCorrStats.addValue(score.mathCorr); logger.info(score.toString()); // find the extensionally best matching tree in the list Pair<EvaluatedRDFResourceTree, Score> bestMatchingTreeWithScore = findBestMatchingTreeFast( solutions, sparqlQuery, noise, examples); EvaluatedRDFResourceTree bestMatchingTree = bestMatchingTreeWithScore .getFirst(); Score bestMatchingScore = bestMatchingTreeWithScore.getSecond(); // position of best tree in list of solutions int positionBestScore = solutions.indexOf(bestMatchingTree); bestSolutionPositionStats.addValue(positionBestScore); Score bestScore = score; if (positionBestScore > 0) { logger.info("Position of best covering tree in list: " + positionBestScore); logger.info("Best covering solution:\n" + render(bestMatchingTree.asEvaluatedDescription())); logger.info("Tree score: " + bestMatchingTree.getTreeScore()); bestScore = bestMatchingScore; logger.info(bestMatchingScore.toString()); } else { logger.info("Best returned solution was also the best covering solution."); } bestSolutionRecallStats.addValue(bestScore.recall); bestSolutionPrecisionStats.addValue(bestScore.precision); bestSolutionFMeasureStats.addValue(bestScore.fmeasure); bestSolutionPredAccStats.addValue(bestScore.predAcc); bestSolutionMathCorrStats.addValue(bestScore.mathCorr); for (RDFResourceTree negTree : examples.negExamplesMapping.values()) { if (QueryTreeUtils.isSubsumedBy(negTree, bestMatchingTree.getTree())) { Files.append(sparqlQuery + "\n", new File("/tmp/negCovered.txt"), Charsets.UTF_8); break; } } String bestQuery = QueryFactory.create(QueryTreeUtils.toSPARQLQueryString( filter.apply(bestMatchingTree.getTree()), dataset.getBaseIRI(), 
dataset.getPrefixMapping())).toString(); if (write2DB) { write2DB(sparqlQuery, nrOfExamples, examples, noise, baseLineQuery, baselineScore, heuristicName, measureName, QueryFactory.create(learnedSPARQLQuery).toString(), score, runtimeBestSolution, bestQuery, positionBestScore, bestScore); } } catch (Exception e) { failed.set(true); logger.error("Error occured for query\n" + sparqlQuery, e); try { StringWriter sw = new StringWriter(); PrintWriter pw = new PrintWriter(sw); e.printStackTrace(pw); Files.append(sparqlQuery + "\n" + sw.toString(), new File(benchmarkDirectory, "failed-" + nrOfExamples + "-" + noise + "-" + heuristicName + "-" + measureName + ".txt"), Charsets.UTF_8); } catch (IOException e1) { e1.printStackTrace(); } } finally { int cnt = currentNrOfFinishedRuns.incrementAndGet(); logger.info("***********Evaluation Progress:" + NumberFormat.getPercentInstance() .format((double) cnt / totalNrOfQTLRuns) + "(" + cnt + "/" + totalNrOfQTLRuns + ")" + "***********"); } }); } tp.shutdown(); tp.awaitTermination(12, TimeUnit.HOURS); Logger.getRootLogger().removeAppender(appender); if (!failed.get()) { String result = ""; result += "\nBaseline Precision:\n" + baselinePrecisionStats; result += "\nBaseline Recall:\n" + baselineRecallStats; result += "\nBaseline F-measure:\n" + baselineFMeasureStats; result += "\nBaseline PredAcc:\n" + baselinePredAccStats; result += "\nBaseline MathCorr:\n" + baselineMathCorrStats; result += "#Returned solutions:\n" + nrOfReturnedSolutionsStats; result += "\nOverall Precision:\n" + bestReturnedSolutionPrecisionStats; result += "\nOverall Recall:\n" + bestReturnedSolutionRecallStats; result += "\nOverall F-measure:\n" + bestReturnedSolutionFMeasureStats; result += "\nOverall PredAcc:\n" + bestReturnedSolutionPredAccStats; result += "\nOverall MathCorr:\n" + bestReturnedSolutionMathCorrStats; result += "\nTime until best returned solution found:\n" + bestReturnedSolutionRuntimeStats; result += "\nPositions of best solution:\n" + 
Arrays.toString(bestSolutionPositionStats.getValues()); result += "\nPosition of best solution stats:\n" + bestSolutionPositionStats; result += "\nOverall Precision of best solution:\n" + bestSolutionPrecisionStats; result += "\nOverall Recall of best solution:\n" + bestSolutionRecallStats; result += "\nOverall F-measure of best solution:\n" + bestSolutionFMeasureStats; result += "\nCBD generation time(total):\t" + MonitorFactory.getTimeMonitor(TimeMonitors.CBD_RETRIEVAL.name()).getTotal() + "\n"; result += "CBD generation time(avg):\t" + MonitorFactory.getTimeMonitor(TimeMonitors.CBD_RETRIEVAL.name()).getAvg() + "\n"; result += "Tree generation time(total):\t" + MonitorFactory.getTimeMonitor(TimeMonitors.TREE_GENERATION.name()).getTotal() + "\n"; result += "Tree generation time(avg):\t" + MonitorFactory.getTimeMonitor(TimeMonitors.TREE_GENERATION.name()).getAvg() + "\n"; result += "Tree size(avg):\t" + treeSizeStats.getMean() + "\n"; logger.info(result); try { Files.write(result, statsFile, Charsets.UTF_8); } catch (IOException e) { e.printStackTrace(); } data[i][j] = bestReturnedSolutionFMeasureStats.getMean(); if (write2DB) { write2DB(heuristicName, measureName, nrOfExamples, noise, bestReturnedSolutionFMeasureStats.getMean(), bestReturnedSolutionPrecisionStats.getMean(), bestReturnedSolutionRecallStats.getMean(), bestReturnedSolutionPredAccStats.getMean(), bestReturnedSolutionMathCorrStats.getMean(), bestSolutionPositionStats.getMean(), bestSolutionFMeasureStats.getMean(), bestSolutionPrecisionStats.getMean(), bestSolutionRecallStats.getMean(), bestSolutionPredAccStats.getMean(), bestSolutionMathCorrStats.getMean(), baselineFMeasureStats.getMean(), baselinePrecisionStats.getMean(), baselineRecallStats.getMean(), baselinePredAccStats.getMean(), baselineMathCorrStats.getMean(), bestReturnedSolutionRuntimeStats.getMean()); } } } } String content = "###"; String separator = "\t"; for (double noiseInterval1 : noiseIntervals) { content += separator + noiseInterval1; 
} content += "\n"; for (int i = 0; i < nrOfExamplesIntervals.length; i++) { content += nrOfExamplesIntervals[i]; for (int j = 0; j < noiseIntervals.length; j++) { content += separator + data[i][j]; } content += "\n"; } File examplesVsNoise = new File(benchmarkDirectory, "examplesVsNoise-" + heuristicName + "-" + measureName + ".tsv"); try { Files.write(content, examplesVsNoise, Charsets.UTF_8); } catch (IOException e) { logger.error("failed to write stats to file", e); } } } if (write2DB) { conn.close(); } if (useEmailNotification) { sendFinishedMail(); } long t2 = System.currentTimeMillis(); long duration = t2 - t1; logger.info("QTL evaluation finished in " + DurationFormatUtils.formatDurationHMS(duration) + "ms."); }
From source file:org.hawkular.client.test.metrics.openshift.CollectionRateDetailTest.java
private void getData(String metricID, String testID, long start, long end, Duration timeBucket) { Reporter.log("Fetching large data set... may take a couple minutes", true); List<DataPoint<Double>> rawData = client().metrics().gauge() .findGaugeDataWithId(metricID, String.valueOf(start), String.valueOf(end), null, null, null) .getEntity();// w w w. j a v a 2 s. c o m Assert.assertNotNull(rawData, testID); Reporter.log("raw datapoints: " + rawData.size(), true); List<Long> zeroList = findZeroValues(rawData); Assert.assertTrue(zeroList == null || zeroList.size() == 0, testID); Map<Long, Integer> hist = OpenshiftBaseTest.makeHistogram(rawData, timeBucket); Double[] result = hist.entrySet().stream().map(x -> new Double(x.getValue())) .toArray(size -> new Double[size]); double[] d = ArrayUtils.toPrimitive(result); // drop the first and last as they are usually outliers double[] samples = Arrays.copyOfRange(d, 1, d.length - 1); DescriptiveStatistics stats = new DescriptiveStatistics(samples); Reporter.log(hist.toString(), true); Reporter.log("size: " + stats.getN(), true); Reporter.log("min/max: " + stats.getMin() + "/" + stats.getMax(), true); Reporter.log("mean: " + stats.getMean(), true); Reporter.log("variance: " + stats.getVariance(), true); Reporter.log("stddev: " + stats.getStandardDeviation(), true); }