from __future__ import division
from scipy import c_, ones, dot, stats, diff
import scipy.linalg
from numpy import log, pi, sqrt, square, diagonal
[docs]class OLS(object):
"""
Ordinary least squares multivariate regression.
"""
def __init__(self):
"""Initializing the OLS class.
"""
pass
[docs] def train(self,X,y,useQR = True, addConstant = True):
'''Solve y = Xb.
Parameters
----------
x : array, shape (M, N)
y : array, shape (M,)
useQR : boolean
Whether or not to use QR decomposition to fit regression line.
addConstant: boolean
Whether or not to add a constant column to X
'''
if y.shape[0] != X.shape[0]:
raise ValueError('incompatible dimensions')
if addConstant:
self.X = c_[ones(X.shape[0]), X]
self.y = y
self.X_columns = getattr(X,'columns', None)
self.y_columns = getattr(y,'columns', None)
if useQR:
# TODO: Ehh, this is broken. Need to fix.
Q,R = scipy.linalg.qr(self.X)
Qty = dot(Q.T, y)
self.b = scipy.linalg.solve(R,Qty)
else:
self.inv_xx = inv(dot(self.X.T,self.X))
xy = dot(self.X.T,self.y)
self.b = dot(self.inv_xx,xy)
self.computeStatistics()
[docs] def computeStatistics(self):
self.nobs = self.y.shape[0] # number of observations
self.ncoef = self.X.shape[1] # number of coef.
self.df_e = self.nobs - self.ncoef # degrees of freedom, error
self.df_r = self.ncoef - 1 # degrees of freedom, regression
self.e = self.y - dot(self.X,self.b) # residuals
self.sse = dot(self.e,self.e)/self.df_e # SSE
self.se = sqrt(diagonal(self.sse*self.inv_xx)) # coef. standard errors
self.t = self.b / self.se # coef. t-statistics
self.p = (1-stats.t.cdf(abs(self.t), self.df_e)) * 2 # coef. p-values
self.R2 = 1 - self.e.var()/self.y.var() # model R-squared
self.R2adj = 1-(1-self.R2)*((self.nobs-1)/(self.nobs-self.ncoef)) # adjusted R-square
self.F = (self.R2/self.df_r) / ((1-self.R2)/self.df_e) # model F-statistic
self.Fpv = 1-stats.f.cdf(self.F, self.df_r, self.df_e) # F-statistic p-value
[docs] def dw(self):
"""Calculates the Durbin-Waston statistic
"""
de = diff(self.e,1)
dw = dot(de,de) / dot(self.e,self.e)
return dw
[docs] def omni(self):
"""Omnibus test for normality
"""
return stats.normaltest(self.e)
[docs] def JB(self):
"""Calculate residual skewness, kurtosis, and do the JB test for normality
"""
# Calculate residual skewness and kurtosis
skew = stats.skew(self.e)
kurtosis = 3 + stats.kurtosis(self.e)
# Calculate the Jarque-Bera test for normality
JB = (self.nobs/6) * (square(skew) + (1/4)*square(kurtosis-3))
JBpv = 1-stats.chi2.cdf(JB,2);
return JB, JBpv, skew, kurtosis
[docs] def ll(self):
"""Calculate model log-likelihood and two information criteria
"""
# Model log-likelihood, AIC, and BIC criterion values
ll = -(self.nobs*1/2)*(1+log(2*pi)) - (self.nobs/2)*log(dot(self.e,self.e)/self.nobs)
aic = -2*ll/self.nobs + (2*self.ncoef/self.nobs)
bic = -2*ll/self.nobs + (self.ncoef*log(self.nobs))/self.nobs
return ll, aic, bic
def __str__(self):
# extra stats
ll, aic, bic = self.ll()
JB, JBpv, skew, kurtosis = self.JB()
omni, omnipv = self.omni()
output = GREEN + \
'==============================================================================\n' + \
'OLS Fit \n' + \
'Dependent Variable: ' + self.y_varnm + '\nN: %5.0f \np: %5.0f \n' % (self.nobs, self.ncoef) + \
'Models stats Residual stats \n' + \
'==============================================================================\n' + ENDCOLOR + \
'R-squared % -5.6f Durbin-Watson stat % -5.6f \n' % (self.R2, self.dw()) + \
'Adjusted R-squared % -5.6f Omnibus stat % -5.6f \n' % (self.R2adj, omni) + \
'F-statistic % -5.3f Prob(Omnibus stat) % -5.6f \n' % (self.F, omnipv) + \
'Prob (F-statistic) % -5.6f JB stat % -5.6f \n' % (self.Fpv, JB) + \
'Log likelihood % -5.3f Prob(JB) % -5.6f \n' % (ll, JBpv) + \
'AIC criterion % -5.6f Skew % -5.6f \n' % (aic, skew) + \
'BIC criterion % -5.6f Kurtosis % -5.6f \n' % (bic, kurtosis)+GREEN + \
'==============================================================================\n' + \
'variable coefficient std. Error t-statistic prob \n' + \
'==============================================================================\n' + ENDCOLOR
output += 'Intercept\t % -5.6f % -5.6f % -5.6f % -5.6f\n' % (self.b[0],self.se[0],self.t[0],self.p[0])
for i in range(len(self.X_columns)):
output += '% -5s % -5.6f % -5.6f % -5.6f % -5.6f\n' % (self.X_columns[i],self.b[i+1],self.se[i+1],self.t[i+1],self.p[i+1])
return output