import configparser
import logging
import os

import numpy as np
import pandas

from qikify.models.dotdict import dotdict
def create_logger(logmodule):
    """Create a logger that writes INFO-level messages to both a log file
    and the console.
    """
    logger = logging.getLogger(logmodule)
    logger.setLevel(logging.INFO)

    # Read the log directory from ~/.qikifyrc, falling back to a default
    # location if the config file is missing or malformed.
    try:
        config = configparser.RawConfigParser()
        config.read(os.path.expanduser('~/.qikifyrc'))
        logdir = config.get('Logging', 'logdir')
    except configparser.Error:
        logdir = '/tmp/qikify/test'
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    logfile = os.path.join(logdir, '%s.log' % logmodule)

    # Create a file handler and a console handler, both at INFO level.
    fh = logging.FileHandler(logfile)
    fh.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)

    # Use the same format for both handlers.
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)

    # Add the handlers to the logger.
    logger.addHandler(fh)
    logger.addHandler(ch)
    return logger
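
# A minimal usage sketch (hypothetical module name; writes to the configured
# logdir, or to the /tmp fallback):
#
#   logger = create_logger('qikify.example')
#   logger.info('pipeline started')
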
def bool2symmetric(data):
    """Map boolean True/False data to symmetric +1/-1 integers."""
    return np.array((data - 0.5) * 2.0, dtype=int)
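
# Example: bool2symmetric(np.array([True, False, True])) -> array([ 1, -1,  1])
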
def standardize(X, scaleDict=None, reverse=False):
    """Standardize data by subtracting the mean and dividing by the
    standard deviation. Set reverse to True to perform the inverse
    operation.

    Parameters
    ----------
    X : numpy ndarray, or pandas.DataFrame
        Data to standardize.
    scaleDict : dict, default None
        Dictionary with elements mean/std to control standardization.
    reverse : boolean, default False
        If this flag is set, the standardization is reversed; e.g., we
        take a dataset with zero mean and unit variance and rescale it to
        a dataset with mean=scaleDict.mean and std=scaleDict.std.
    """
    if reverse:
        return (X * scaleDict.std) + scaleDict.mean
    elif scaleDict is None:
        # No scaling provided: compute column means/stds and return them
        # along with the standardized data.
        scaleDict = dotdict({'mean': X.mean(0).tolist(), 'std': X.std(0).tolist()})
        return scaleDict, (X - scaleDict.mean) / scaleDict.std
    else:
        return (X - scaleDict.mean) / scaleDict.std
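
# A minimal usage sketch (hypothetical data):
#
#   X = np.random.randn(100, 5)
#   scale, Xs = standardize(X)                     # Xs has zero mean, unit variance
#   X_back = standardize(Xs, scale, reverse=True)  # approximately recovers X
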
def zeroMatrixDiagonal(X):
    """Set the diagonal of a matrix to all zeros.

    Parameters
    ----------
    X : numpy ndarray
        Matrix on which to zero out the diagonal.

    Examples
    --------
    Xp = zeroMatrixDiagonal(X)
    """
    return X - np.diag(np.diag(X))
def getParetoFront(data):
    """Extracts the 2D Pareto-optimal front from a 2D numpy array, where
    smaller values are considered better: a point is kept unless some other
    point is strictly smaller in every dimension.

    Parameters
    ----------
    data : numpy ndarray, or pandas.DataFrame
        Data for which we want the Pareto-optimal front.

    Examples
    --------
    p = getParetoFront(data)
    """
    data = np.asarray(data)  # accept DataFrames as well as ndarrays
    dflags = np.ones(data.shape[0], dtype=bool)
    for i in range(data.shape[0]):
        point = data[i, :]
        for j in range(data.shape[0]):
            if i == j:
                continue
            # Point i is dominated if point j is strictly better
            # (smaller) in every dimension.
            if np.all(point > data[j, :]):
                dflags[i] = False
    return np.array(data[dflags, :])
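
# A small worked example (minimization convention): [3, 3] is dominated by
# [1, 2] and is dropped.
#
#   pts = np.array([[1.0, 2.0], [2.0, 1.0], [3.0, 3.0]])
#   getParetoFront(pts)   # -> array([[1., 2.], [2., 1.]])
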
def is1D(data):
    """Determine if data is 1-dimensional."""
    return data.shape[0] == np.size(data)
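
# Example: is1D(np.zeros(5)) is True; is1D(np.zeros((5, 2))) is False.
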
def partition(data, threshold=0.5, verbose=False):
    """Partitions data into training and test sets. Assumes the last
    column of data is y.

    Parameters
    ----------
    data : numpy ndarray, or pandas.DataFrame
        Data to partition into training and test sets.
    threshold : float
        Fraction of rows assigned, in expectation, to the training set.
    verbose : boolean, default False
        If True, print the shapes of the resulting partitions.
    """
    if data.ndim != 2:
        raise ValueError('data must be 2-dimensional')
    nrow, ncol = data.shape

    # Create boolean vectors identifying rows in the training/test sets.
    index = np.random.rand(nrow)
    train_index = index < threshold
    test_index = index >= threshold

    if isinstance(data, pandas.DataFrame):
        xtrain = data.iloc[train_index, :ncol - 1]
        ytrain = data.iloc[train_index, ncol - 1]
        xtest = data.iloc[test_index, :ncol - 1]
        ytest = data.iloc[test_index, ncol - 1]
    elif isinstance(data, np.ndarray):
        xtrain = data[train_index, :-1]
        ytrain = data[train_index, -1]
        xtest = data[test_index, :-1]
        ytest = data[test_index, -1]
    else:
        raise TypeError('data must be numpy.ndarray or pandas.DataFrame')

    if verbose:
        print('Randomly partitioned data, with threshold={0}'.format(threshold))
        print('{:<10} nrow: {:<4} ncol: {:<4}'.format('xtrain', *xtrain.shape))
        print('{:<10} nrow: {:<4} ncol: {:<4}'.format('ytrain', ytrain.size, 1))
        print('{:<10} nrow: {:<4} ncol: {:<4}'.format('xtest', *xtest.shape))
        print('{:<10} nrow: {:<4} ncol: {:<4}'.format('ytest', ytest.size, 1))
    return xtrain, ytrain, xtest, ytest
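
# A minimal usage sketch (hypothetical data; the last column is y):
#
#   data = np.random.randn(100, 4)
#   xtrain, ytrain, xtest, ytest = partition(data, threshold=0.7)
#   # roughly 70% of rows land in (xtrain, ytrain), the rest in (xtest, ytest)
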
def nmse(yhat, y, min_y=None, max_y=None):
    """Calculates the normalized mean-squared error.

    Parameters
    ----------
    yhat : 1d array or list of floats
        Estimated values of y.
    y : 1d array or list of floats
        True values.
    min_y, max_y : float, float
        Roughly the min and max; they do not have to be the exact values
        of min and max, because they are only used to scale the output
        into a roughly [0, 1] range. If omitted, they default to min(y)
        and max(y).

    Examples
    --------
    err = nmse(yhat, y)
    """
    # Base case: no entries.
    if len(yhat) == 0:
        return 0.0
    if min_y is None:
        min_y = min(y)
    if max_y is None:
        max_y = max(y)
    # Base case: both yhat and y are constant and equal.
    if (max_y == min_y) and (max(yhat) == min(yhat) == max(y) == min(y)):
        return 0.0
    # Main case.
    assert max_y > min_y, 'max_y=%g was not > min_y=%g' % (max_y, min_y)
    yhat_a, y_a = np.asarray(yhat), np.asarray(y)
    y_range = float(max_y - min_y)
    try:
        result = np.sqrt(np.mean(((yhat_a - y_a) / y_range) ** 2))
        if np.isnan(result):
            return np.inf
        return result
    except ValueError:
        # e.g. yhat and y have mismatched shapes.
        print('nmse: could not compute a valid result')
        return np.inf
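
# Example (hypothetical values): a constant offset of 0.1 over a y-range of 2
# gives nmse = sqrt(mean((0.1 / 2)**2)) = 0.05.
#
#   y    = [0.0, 1.0, 2.0]
#   yhat = [0.1, 1.1, 2.1]
#   nmse(yhat, y)   # -> 0.05
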
def computeR2(yhat, y):
    """Computes the R-squared coefficient of determination as the squared
    Pearson correlation between yhat and y:

    R2 = np.corrcoef(yhat, y)[0, 1] ** 2

    Note this equals 1 - sum((yhat - y)**2) / sum((y - np.mean(y))**2)
    only when yhat comes from a least-squares fit with an intercept.

    Parameters
    ----------
    yhat : 1d array or list of floats
        Estimated values of y.
    y : 1d array or list of floats
        True values.

    Examples
    --------
    r2 = computeR2(yhat, y)
    """
    return np.corrcoef(yhat, y)[0, 1] ** 2
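
# Example (hypothetical values): a perfect linear relationship gives R2 = 1.
#
#   computeR2([2.0, 4.0, 6.0], [1.0, 2.0, 3.0])   # -> 1.0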