Source code for qikify.controllers.LSFS

import numpy as np
import pandas
from scipy.spatial.distance import pdist, squareform
from qikify.helpers import standardize

# Laplacian score feature selection
[docs]class LSFS(object):
[docs] def run(self, Xin, gnd): """Run Laplacian Score Feature Selection. .. note:: Eventually, it'd be nice to maintain col names with Xin so that we can add a plot method to plot scores vs. column names. Notes ----- This code is based on the definition from the paper [1]_: .. math:: X * \frac{\sum_{ij} (f_r^i - f_r^j) * S_{ij}}{sigma_2} .. [1] He, X. and Cai, D. and Niyogi, P., "Laplacian Score for Feature Selection", NIPS 2005. Parameters ---------- Xin : array_like A numpy.ndarray or pandas.DataFrame, with rows corresponding to observations and columns to features. gnd : array_like A numpy.ndarray or pandas.DataFrame pass/fail vector of the same dimension as Xin """ if isinstance(Xin, pandas.DataFrame): X = Xin.as_matrix() else: X = Xin nSmp = X.shape[0] if nSmp != len(gnd): raise Exception("Data and gnd do not have matching sizes") _, X = standardize(X) # Per LSFS paper, S_ij = exp(-||x_i - x_j||^2 / t). I've found that # t = ncol(X) to be a suitable choice; anything on that order should # work just fine. S = self.constructS(X, gnd, t=X.shape[1]) D = sum(S,1) z = (np.dot(D,X) * np.dot(D,X)) / sum(D) DPrime = sum(np.dot(X.T,np.diag(D)).T * X,0) - z LPrime = sum((np.dot(X.T,S).T * X),1) - z # Remove trivial solutions DPrime[DPrime < 1e-12] = np.inf # Compute and retain Laplacian scores and rankings self.Scores = (LPrime/DPrime).T self.Ranking = np.argsort(-self.Scores) del S # Clean up to save memory return self
[docs] def threshold(self, T_L): self.subset = self.Scores > T_L self.nRetained = int(sum(self.subset)) print 'LSFS: retained', GREEN+str(self.nRetained)+ENDCOLOR, 'parameters.' return self.subset # Construct the W matrix used in LSFS
[docs] def constructS(self, X, gnd, k = 0, t = 1, bLDA=False, bSelfConnected=True): label = np.unique(gnd) G = np.zeros((len(gnd),len(gnd))) if bLDA: for i in xrange(len(label)): ind = (gnd==label[i]) G[ix_(ind,ind)] = 1.0/sum(ind) return G else: for i in xrange(len(label)): ind = np.nonzero(gnd==label[i])[0] D = squareform(pdist(X[ind,:], 'sqeuclidean')) # D_ij = ||x_i - x_j||^2 S = np.exp(-D/t) # Per LSFS paper, exp(-||x_i - x_j||^2 / t) self._setSubMat(G, S, ind) if not bSelfConnected: G = zeroMatrixDiagonal(G) return self._genMaxMatrix(G) print 'LSFS: Construction of W matrix complete.'
def _setSubMat(self, X, D, ind): """Set a submatrix of X defined by the index ind to values in D. That is: [0, 0, 0] X = [0, 0, 0] D = [1 2] ind = [0 1] [0, 0, 0] [3 4] Gives: [1, 2, 0] X = [3, 4, 0] [0, 0, 0] """ for i, row in enumerate(ind): X[row,ind] = D[i,:] def _genMaxMatrix(self, A): """Takes a square matrix A and computes max(A, A')""" ind = (A.T - A) > 0 A[ind] = A.T[ind] return A