Source code for qikify.controllers.KDE

import sys, os, random, pandas
import numpy as np
from scipy.special import gamma
from qikify.helpers import standardize
from qikify.models.specs import Specs
from qikify.controllers.slicesample import slicesample

class KDE(object):
    """This class implements non-parametric kernel density estimation."""
    def run(self, X, specs=None, nSamples=0, counts=None, a=0, bounds=None):
        """Primary execution point. Runs either standard KDE or
        class-membership-based (partitioned) KDE: if `counts` is provided,
        partitioned sampling is used instead of standard KDE. (Illustrative
        usage sketches appear after the class definition below.)

        Parameters
        ----------
        X : array_like
            Contains data stored in a pandas.DataFrame.
        specs : qikify.models.Specs, optional
            If using partitioned sampling, boundaries defining
            pass/critical/fail subspaces must be provided.
        nSamples : int
            The number of samples to generate.
        counts : dict, optional
            If using partitioned sampling, a counts dictionary must be
            provided, with three keys: nGood, nCritical, nFail.
        a : float, optional
            Adaptive bandwidth sensitivity; if a > 0, local bandwidth
            factors (lambdas) are estimated from a pilot density.
        bounds : array_like, optional
            Explicit lower/upper bounds used for boundary mirroring;
            defaults to the per-column min/max of X.
        """
        self.n, self.d = X.shape
        self.specs = specs
        self.columns = getattr(X, 'columns', None)

        # Normalize data/bounds
        self.scaleFactors, self.Xn = standardize(X)
        self.bounds = standardize(np.array([X.min(0), X.max(0)]), self.scaleFactors)
        if bounds is not None:
            self.bounds = standardize(bounds, self.scaleFactors)

        # Select bandwidth for Epanechnikov kernel (Rule of Thumb, see Silverman, p.86)
        self.b = 0.8    # Default bandwidth scaling factor
        self.c_d = 2.0 * pow(np.pi, (self.d / 2.0)) / (self.d * gamma(self.d / 2.0))
        self.h = self._compute_h(self.n, self.d, self.c_d, self.b)
        self._setBandwidthFactors(a)

        # Generate samples
        if counts is None:
            return self._genSamples(nSamples)
        else:
            print 'KDE: Running on dataset of size n:', self.n, 'd:', self.d, 'and generating', sum(counts.values()), 'samples.'
            self._genSpecLimits(X, specs)
            return self._genPartitionedSamples(counts)

    # =============== Private class methods ===============
    def _genSpecLimits(self, X, specs):
        """Convert spec limits to arrays so that spec comparison during
        device-sample generation is fast."""
        self.inner = np.zeros((2, X.shape[1]))
        self.outer = np.zeros((2, X.shape[1]))
        for i, name in enumerate(X.columns):
            self.inner[:, i] = self.specs.inner[name] if name in self.specs.inner.keys() else [-np.inf, np.inf]
            self.outer[:, i] = self.specs.outer[name] if name in self.specs.outer.keys() else [-np.inf, np.inf]

    # Default method of generating device samples
    def _genSamples(self, nSamples):
        """Generate KDE samples."""
        Sn = np.vstack([self._genSample() for _ in xrange(nSamples)])
        return pandas.DataFrame(standardize(Sn, self.scaleFactors, reverse=True), columns=self.columns)

    def _genPartitionedSamples(self, counts):
        """Generate nCritical critical devices, nGood good devices, and nFail
        failing devices, with each region defined by specs.inner / specs.outer."""
        # Initialize arrays for speed
        Sg = np.zeros((counts.nGood, self.d))
        Sc = np.zeros((counts.nCritical, self.d))
        Sf = np.zeros((counts.nFail, self.d))
        ng, nc, nf = 0, 0, 0
        thresh = 0.02
        while (ng + nc + nf < sum(counts.values())):
            sample = standardize(self._genSample(), self.scaleFactors, reverse=True)
            if self._isGood(sample) and ng < counts.nGood:
                Sg[ng, :] = sample
                ng += 1
            if self._isFailing(sample) and nf < counts.nFail:
                Sf[nf, :] = sample
                nf += 1
            if self._isCritical(sample) and nc < counts.nCritical:
                Sc[nc, :] = sample
                nc += 1

            # Print the number generated in each category so we can monitor
            # progress, since this can take a while :)
            if (1.0 * (ng + nc + nf) / sum(counts.values())) > thresh:
                print 'Ng:%i/%i Nc:%i/%i Nf:%i/%i' % (ng, counts.nGood, nc, counts.nCritical, nf, counts.nFail)
                thresh += 0.02
        print 'Non-parametric density estimation sampling complete.'
        return pandas.DataFrame(np.vstack((Sc, Sg, Sf)), columns=self.columns)

    def _genSample(self):
        """Generate a single sample using the algorithm in Silverman, p. 143."""
        j = np.random.randint(self.n)
        e = slicesample(np.zeros(self.d), 1, self._K_e)
        s = self.Xn.ix[j, :] + self.h * self.lambdas[j] * e

        # Use mirroring technique to deal with boundary conditions.
        # Perhaps we should consider applying boundary kernels here...
        for k in xrange(self.d):
            if (s[k] < self.bounds[0, k]):
                s[k] = self.bounds[0, k] + abs(self.bounds[0, k] - s[k])
            if (s[k] > self.bounds[1, k]):
                s[k] = self.bounds[1, k] - abs(self.bounds[1, k] - s[k])
        return s

    def _setBandwidthFactors(self, a):
        """Estimate local bandwidth factors lambda."""
        self.lambdas = np.ones(self.n)
        if (a > 0):
            B = [np.log10(self._f_pilot(self.Xn.ix[i, :], self.Xn)) for i in xrange(self.n)]
            g = pow(10, sum(B) / self.n)
            self.lambdas = [pow(self._f_pilot(self.Xn.ix[i, :], self.Xn) / g, -a) for i in xrange(self.n)]

    def _compute_h(self, n, d, c_d, b):
        """Computed separately to preserve readability of the equation."""
        return b * pow((8 / c_d * (d + 4) * pow(2 * np.sqrt(np.pi), d)), (1.0 / (d + 4))) * pow(n, -1.0 / (d + 4))

    def _K_e(self, t):
        """Epanechnikov kernel."""
        return (self.d + 2) / self.c_d * (1 - np.dot(t, t)) / 2 if np.dot(t, t) < 1 else 0

    def _f_pilot(self, x, Xn):
        """Compute pilot density point estimate f(x)."""
        A = [self._K_e((x - Xn.ix[i, :]) / self.h) for i in xrange(self.n)]
        return (pow(self.h, -self.d) * sum(A)) / self.n

    # =============== Partitioned Sampling Methods ===============
    def _isGood(self, sample):
        return all(sample > self.inner[0, :]) and all(sample < self.inner[1, :])

    def _isCritical(self, sample):
        return not (self._isGood(sample) or self._isFailing(sample))

    def _isFailing(self, sample):
        return (any(sample < self.outer[0, :]) or any(sample > self.outer[1, :]))
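# ----------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): the function
# name, column names, and sample counts below are hypothetical. It shows how
# run() is typically invoked for standard, non-partitioned KDE sampling.
# ----------------------------------------------------------------------
def _example_standard_sampling():
    """Generate 1000 synthetic devices following the empirical distribution
    of a hypothetical 3-parameter dataset."""
    X = pandas.DataFrame(np.random.randn(500, 3), columns=['p1', 'p2', 'p3'])
    # Returns a pandas.DataFrame with the same columns as X.
    return KDE().run(X, nSamples=1000)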
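# ----------------------------------------------------------------------
# Illustrative sketch of partitioned sampling (also not part of the original
# module). run() reads counts both as a dict (counts.values()) and by
# attribute (counts.nGood), and uses only the .inner / .outer dicts of the
# specs object, so minimal hypothetical stand-ins with that interface are
# used here; the real qikify.models.Specs constructor may differ.
# ----------------------------------------------------------------------
def _example_partitioned_sampling():
    """Generate good/critical/failing devices for one hypothetical parameter."""
    class Counts(dict):
        # Dict with attribute access, matching how counts is used above.
        __getattr__ = dict.__getitem__

    class SpecsStandIn(object):
        # Minimal stand-in exposing the .inner / .outer dicts keyed by
        # column name, as expected by _genSpecLimits.
        def __init__(self, inner, outer):
            self.inner, self.outer = inner, outer

    X = pandas.DataFrame(np.random.randn(500, 1), columns=['p1'])
    counts = Counts(nGood=80, nCritical=15, nFail=5)
    specs = SpecsStandIn(inner={'p1': [-1.0, 1.0]}, outer={'p1': [-2.0, 2.0]})
    return KDE().run(X, specs=specs, counts=counts)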