# -*- coding: utf-8 -*-
"""linear model with Theil prior probabilistic restrictions, generalized Ridge
Created on Tue Dec 20 00:10:10 2011
Author: Josef Perktold
License: BSD-3
open issues
* selection of smoothing factor, strength of prior, cross validation
* GLS, does this really work this way
* None of inherited results have been checked yet,
I'm not sure if any need to be adjusted or if only interpretation changes
One question is which results are based on likelihood (residuals) and which
are based on "posterior" as for example bse and cov_params
* helper functions to construct priors?
* increasing penalization for ordered regressors, e.g. polynomials
* compare with random/mixed effects/coefficient, like estimated priors
there is something fishy with the result instance, some things, e.g.
normalized_cov_params, don't look like they update correctly as we
search over lambda -> some stale state again ?
I added df_model to result class using the hatmatrix, but df_model is defined
in model instance not in result instance. -> not clear where refactoring should
occur. df_resid doesn't get updated correctly.
problem with definition of df_model, it has 1 subtracted for constant
"""
from __future__ import print_function
from statsmodels.compat.python import lrange
import numpy as np
import statsmodels.base.model as base
from statsmodels.regression.linear_model import OLS, GLS, RegressionResults
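
# For orientation: TheilGLS.fit below solves the penalized normal equations
# (written out here for reference; see the method body for the exact code)
#
#     (X'X + lambd * sigma2_e * R' S^{-1} R) b = X'y + lambd * sigma2_e * R' S^{-1} q
#
# where R and q are the restriction matrices (r_matrix, q_matrix), S is the
# prior covariance sigma_prior, sigma2_e is the residual variance from an
# unpenalized GLS fit, and lambd scales the strength of the prior.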

def atleast_2dcols(x):
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    return x

class TheilGLS(GLS):
    '''GLS with probabilistic restrictions,

    essentially Bayes with an informative prior.

    Note: I'm making up the GLS part; this might work only for OLS.
    '''

    def __init__(self, endog, exog, r_matrix, q_matrix=None, sigma_prior=None,
                 sigma=None):
        self.r_matrix = np.asarray(r_matrix)
        if q_matrix is None:
            #default: center the restrictions at zero
            q_matrix = np.zeros(self.r_matrix.shape[0])
        self.q_matrix = atleast_2dcols(q_matrix)
        if sigma_prior is None:
            #default: identity prior covariance for the restrictions
            sigma_prior = np.eye(self.r_matrix.shape[0])
        elif np.size(sigma_prior) == 1:
            sigma_prior = sigma_prior * np.eye(self.r_matrix.shape[0])  #no numerical shortcuts
        self.sigma_prior = sigma_prior
        self.sigma_prior_inv = np.linalg.pinv(sigma_prior)  #or inv
        super(TheilGLS, self).__init__(endog, exog, sigma=sigma)

    def fit(self, lambd=1.):
        #this does a duplicate transformation, but I need resid, not wresid
        res_gls = GLS(self.endog, self.exog, sigma=self.sigma).fit()
        self.res_gls = res_gls
        sigma2_e = res_gls.mse_resid

        r_matrix = self.r_matrix
        q_matrix = self.q_matrix
        sigma_prior_inv = self.sigma_prior_inv
        x = self.wexog
        y = self.wendog[:, None]
        #why are sigma2_e and lambd multiplied, not their ratio?
        #larger lambd -> stronger prior (it's not the variance)
        #print('lambd inside fit', lambd)
        xpx = np.dot(x.T, x) + \
              sigma2_e * lambd * np.dot(r_matrix.T, np.dot(sigma_prior_inv, r_matrix))
        xpy = np.dot(x.T, y) + \
              sigma2_e * lambd * np.dot(r_matrix.T, np.dot(sigma_prior_inv, q_matrix))
        #xpy = xpy[:, None]
        xpxi = np.linalg.pinv(xpx)
        params = np.dot(xpxi, xpy)  #or solve
        params = np.squeeze(params)
        self.normalized_cov_params = xpxi  #why attach it to self, i.e. the model?

        lfit = TheilRegressionResults(self, params,
                                      normalized_cov_params=xpxi)
        lfit.penalization_factor = lambd
        return lfit

    def fit_minic(self):
        #this doesn't make sense, since the number of parameters stays unchanged
        #need leave-one-out, gcv, or some penalization for weak priors
        #added extra penalization for lambd

        def get_bic(lambd):
            #return self.fit(lambd).bic #+lambd #+ 1./lambd #added 1/lambd for checking
            #return self.fit(lambd).gcv()
            #return self.fit(lambd).cv()
            return self.fit(lambd).aicc()

        from scipy import optimize
        lambd = optimize.fmin(get_bic, 1.)
        return lambd
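
#A possible variation on fit_minic (a sketch, not wired in): use a bounded
#scalar minimizer so that negative penalization weights are excluded; the
#upper bound below is arbitrary and `mod` stands for a TheilGLS instance.
#
#    from scipy import optimize
#    opt = optimize.minimize_scalar(lambda lb: mod.fit(lb).aicc(),
#                                   bounds=(0., 100.), method='bounded')
#    lambd_opt = opt.x
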
#TODO:
#I need the hatmatrix in the model if I want to do iterative fitting, e.g. GCV
#move to model or use it from a results instance inside the model,
# each call to fit returns results instance

class TheilRegressionResults(RegressionResults):

    #cache
    def hatmatrix_diag(self):
        '''diagonal of the hat matrix,

        diag(X (X'X + sigma2_e * lambd * R' S^{-1} R)^{-1} X')

        Notes
        -----
        This uses wexog, so it includes weights or sigma - check this case.
        It is not clear whether we need to multiply by sigmahalf, i.e. use

        (W^{-0.5} X) (X' W X)^{-1} (W^{-0.5} X)'   or
        (W X) (X' W X)^{-1} (W X)'

        The projection is y_hat = H y, or in terms of the transformed
        variable (W^{-0.5} y). This might be wrong for the WLS and GLS case.
        '''
        xpxi = self.model.normalized_cov_params
        #something fishy with self.normalized_cov_params in the results, doesn't update
        #print(self.model.wexog.shape, np.dot(xpxi, self.model.wexog.T).shape)
        return (self.model.wexog * np.dot(xpxi, self.model.wexog.T).T).sum(1)

    def hatmatrix_trace(self):
        return self.hatmatrix_diag().sum()

    #this doesn't update df_resid
    @property  #needs to be a property or attribute (not a call)
    def df_model(self):
        return self.hatmatrix_trace()

    #Note: mse_resid uses df_resid, not nobs - k_vars, which can differ if
    #df_model, tr(H), is used
    #in the paper for gcv, ess/nobs is used instead of mse_resid

    def gcv(self):
        return self.mse_resid / (1. - self.hatmatrix_trace() / self.nobs)**2

    def cv(self):
        return ((self.resid / (1. - self.hatmatrix_diag()))**2).sum() / self.nobs

    def aicc(self):
        aic = np.log(self.mse_resid) + 1
        aic += 2 * (1. + self.hatmatrix_trace()) / (self.nobs - self.hatmatrix_trace() - 2)
        return aic
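
#Example use of the criteria above (a sketch, not executed here): choose the
#penalization weight by minimizing gcv (or cv, aicc) over a grid, where `mod`
#is an already constructed TheilGLS instance.
#
#    lambd_grid = np.linspace(0., 80., 41)
#    gcvs = [mod.fit(lb).gcv() for lb in lambd_grid]
#    lambd_best = lambd_grid[np.argmin(gcvs)]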

#contrast/restriction matrices, temporary location

def coef_restriction_meandiff(n_coeffs, n_vars=None, position=0):

    reduced = np.eye(n_coeffs) - 1. / n_coeffs
    if n_vars is None:
        return reduced
    else:
        full = np.zeros((n_coeffs, n_vars))
        full[:, position:position + n_coeffs] = reduced
        return full
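
#A small worked example for orientation (values rounded):
#
#    coef_restriction_meandiff(3)
#    # array([[ 0.667, -0.333, -0.333],
#    #        [-0.333,  0.667, -0.333],
#    #        [-0.333, -0.333,  0.667]])
#
#Each row maps the coefficients to their deviation from the common mean, so a
#prior R b ~ 0 shrinks the coefficients towards each other.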

def coef_restriction_diffbase(n_coeffs, n_vars=None, position=0, base_idx=0):

    reduced = -np.eye(n_coeffs)  #build all rows, drop the base row later
    reduced[:, base_idx] = 1
    keep = lrange(n_coeffs)
    del keep[base_idx]
    reduced = np.take(reduced, keep, axis=0)

    if n_vars is None:
        return reduced
    else:
        full = np.zeros((n_coeffs - 1, n_vars))
        full[:, position:position + n_coeffs] = reduced
        return full
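
#A small worked example for orientation:
#
#    coef_restriction_diffbase(3, base_idx=0)
#    # array([[ 1., -1.,  0.],
#    #        [ 1.,  0., -1.]])
#
#Each row is the difference between the base coefficient and one of the other
#coefficients, so a prior R b ~ 0 shrinks all coefficients towards the base.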

def next_odd(d):
    return d + (1 - d % 2)

def coef_restriction_diffseq(n_coeffs, degree=1, n_vars=None, position=0, base_idx=0):
    #check boundaries, returns "valid" ?

    if degree == 1:
        diff_coeffs = [-1, 1]
        n_points = 2
    elif degree > 1:
        from scipy import misc
        n_points = next_odd(degree + 1)  #smallest odd number >= degree + 1
        diff_coeffs = misc.central_diff_weights(n_points, ndiv=degree)

    dff = np.concatenate((diff_coeffs, np.zeros(n_coeffs - len(diff_coeffs))))
    from scipy import linalg
    reduced = linalg.toeplitz(dff, np.zeros(n_coeffs - len(diff_coeffs) + 1)).T
    #reduced = np.kron(np.eye(n_coeffs-n_points), diff_coeffs)

    if n_vars is None:
        return reduced
    else:
        #use the number of rows of reduced; n_coeffs - 1 is only correct for degree 1
        full = np.zeros((reduced.shape[0], n_vars))
        full[:, position:position + n_coeffs] = reduced
        return full
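
#A small worked example for orientation (first differences, degree=1):
#
#    coef_restriction_diffseq(4)
#    # array([[-1.,  1.,  0.,  0.],
#    #        [ 0., -1.,  1.,  0.],
#    #        [ 0.,  0., -1.,  1.]])
#
#Each row is the difference of neighboring coefficients, so a prior R b ~ 0
#shrinks an ordered sequence of coefficients towards a flat profile.
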
##
## R = np.c_[np.zeros((n_groups, k_vars-1)), np.eye(n_groups)]
## r = np.zeros(n_groups)
## R = np.c_[np.zeros((n_groups-1, k_vars)),
## np.eye(n_groups-1)-1./n_groups * np.ones((n_groups-1, n_groups-1))]

if __name__ == '__main__':

    import statsmodels.api as sm

    examples = [2]

    np.random.seed(765367)
    np.random.seed(97653679)
    nsample = 100
    x = np.linspace(0, 10, nsample)
    X = sm.add_constant(np.column_stack((x, x**2, (x/5.)**3)), prepend=True)
    beta = np.array([10, 1, 0.1, 0.5])
    y = np.dot(X, beta) + np.random.normal(size=nsample)

    res_ols = sm.OLS(y, X).fit()

    R = [[0, 0, 0, 1]]
    r = [0]  #, 0, 0, 0]
    lambd = 1  #1e-4
    mod = TheilGLS(y, X, r_matrix=R, q_matrix=r, sigma_prior=lambd)
    res = mod.fit()
    print(res_ols.params)
    print(res.params)

    #example 2
    #I need more flexible penalization in this example; the penalization should
    #get stronger for higher order terms
    #np.random.seed(1)
    nobs = 200
    k_vars = 10
    k_true = 6
    sig_e = 0.25  #0.5
    x = np.linspace(-2, 2, nobs)
    #X = sm.add_constant(np.column_stack((x, x**2, (x/5.)**3)), prepend=True)
    X = (x / x.max())[:, None]**np.arange(k_vars)
    beta = np.zeros(k_vars)
    beta[:k_true] = np.array([1, -2, 0.5, 1.5, -0.1, 0.1])[:k_true]
    y_true = np.dot(X, beta)
    y = y_true + sig_e * np.random.normal(size=nobs)

    res_ols = sm.OLS(y, X).fit()

    #R = np.c_[np.zeros((k_vars-4, 4)), np.eye(k_vars-4)]  #has two large true coefficients penalized
    not_penalized = 4
    R = np.c_[np.zeros((k_vars - not_penalized, not_penalized)),
              np.eye(k_vars - not_penalized)]
    #increasingly strong penalization
    R = np.c_[np.zeros((k_vars - not_penalized, not_penalized)),
              np.diag((1 + 2 * np.arange(k_vars - not_penalized)))]
    r = np.zeros(k_vars - not_penalized)
##    R = -coef_restriction_diffseq(6, 1, n_vars=10, position=4)  #doesn't make sense for polynomial
##    R = np.vstack((R, np.zeros(R.shape[1])))
##    R[-1, -1] = 1
    r = np.zeros(R.shape[0])

    lambd = 2  #1e-4
    mod = TheilGLS(y, X, r_matrix=R, q_matrix=r, sigma_prior=lambd)
    res = mod.fit()
    print(res_ols.params)
    print(res.params)

    res_bic = mod.fit_minic()  #this will just return zero
    res = mod.fit(res_bic)
    print(res_bic)

    for lambd in np.linspace(0, 80, 21):
        res_l = mod.fit(lambd)
        #print(lambd, res_l.params[-2:], res_l.bic, res_l.bic + 1./lambd, res.df_model)
        print((lambd, res_l.params[-2:], res_l.bic, res.df_model,
               np.trace(res.normalized_cov_params)))

    import matplotlib.pyplot as plt
    plt.figure()
    plt.plot(beta, 'k-o', label='true')
    plt.plot(res_ols.params, '-o', label='ols')
    plt.plot(res.params, '-o', label='theil')
    plt.legend()
    plt.title('Polynomial fitting: estimated coefficients')

    plt.figure()
    plt.plot(y, 'o')
    plt.plot(y_true, 'k-', label='true')
    plt.plot(res_ols.fittedvalues, '-', label='ols')
    plt.plot(res.fittedvalues, '-', label='theil')
    plt.legend()
    plt.title('Polynomial fitting: fitted values')
    #plt.show()

    if 3 in examples:
        #example 3
        nobs = 600
        nobs_i = 20
        n_groups = nobs // nobs_i
        k_vars = 3

        from statsmodels.sandbox.panel.random_panel import PanelSample
        dgp = PanelSample(nobs, k_vars, n_groups)
        dgp.group_means = 2 + np.random.randn(n_groups)  #add random intercept
        print('seed', dgp.seed)
        y = dgp.generate_panel()
        X = np.column_stack((dgp.exog[:, 1:],
                             dgp.groups[:, None] == np.arange(n_groups)))
        res_ols = sm.OLS(y, X).fit()

        R = np.c_[np.zeros((n_groups, k_vars - 1)), np.eye(n_groups)]
        r = np.zeros(n_groups)
        R = np.c_[np.zeros((n_groups - 1, k_vars)),
                  np.eye(n_groups - 1) - 1. / n_groups * np.ones((n_groups - 1, n_groups - 1))]
        r = np.zeros(n_groups - 1)
        R[:, k_vars - 1] = -1

        lambd = 1  #1e-4
        mod = TheilGLS(y, X, r_matrix=R, q_matrix=r, sigma_prior=lambd)
        res = mod.fit()
        print(res.params)

        params_l = []
        for lambd in np.linspace(0, 20, 21):
            params_l.append(mod.fit(5. * lambd).params)

        params_l = np.array(params_l)

        plt.figure()
        plt.plot(params_l.T)
        plt.title('Panel Data with random intercept: shrinkage to being equal')
        plt.xlabel('parameter index')

        plt.figure()
        plt.plot(params_l[:, k_vars:])
        plt.title('Panel Data with random intercept: shrinkage to being equal')
        plt.xlabel('strength of prior')

    #plt.show()