"""
Multivariate Conditional and Unconditional Kernel Density Estimation
with Mixed Data Types
References
----------
[1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice.
Princeton University Press. (2007)
[2] Racine, Jeff. "Nonparametric Econometrics: A Primer," Foundation
and Trends in Econometrics: Vol 3: No 1, pp1-88. (2008)
http://dx.doi.org/10.1561/0800000009
[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
with Categorical and Continuous Data." Working Paper. (2000)
[4] Racine, J. Li, Q. "Kernel Estimation of Multivariate Conditional
Distributions Annals of Economics and Finance 5, 211-235 (2004)
[5] Liu, R., Yang, L. "Kernel estimation of multivariate
cumulative distribution function."
Journal of Nonparametric Statistics (2008)
[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF
with Categorical and Continuous Data." Working Paper
[7] Li, Q., Racine, J. "Cross-validated local linear nonparametric
regression" Statistica Sinica 14(2004), pp. 485-512
[8] Racine, J.: "Consistent Significance Testing for Nonparametric
Regression" Journal of Business & Economics Statistics
[9] Racine, J., Hart, J., Li, Q., "Testing the Significance of
Categorical Predictor Variables in Nonparametric Regression
Models", 2006, Econometric Reviews 25, 523-544
"""
# TODO: make default behavior efficient=True above a certain n_obs
from statsmodels.compat.python import range, next
import numpy as np
from scipy import optimize
from scipy.stats.mstats import mquantiles
from statsmodels.nonparametric.api import KDEMultivariate, KernelReg
from statsmodels.nonparametric._kernel_base import \
gpke, LeaveOneOut, _get_type_pos, _adjust_shape
__all__ = ['SingleIndexModel', 'SemiLinear', 'TestFForm']


class SingleIndexModel(KernelReg):
"""
Single index semiparametric model ``y = g(X * b) + e``.
Parameters
----------
endog: array_like
The dependent variable
exog: array_like
The independent variable(s)
var_type: str
The type of variables in X:
- c: continuous
- o: ordered
- u: unordered
Attributes
----------
b: array_like
The linear coefficients b (betas)
bw: array_like
Bandwidths
Methods
-------
fit(): Computes the fitted values ``E[Y|X] = g(X * b)``
and the marginal effects ``dY/dX``.

    References
    ----------
    See chapter on semiparametric models in [1]

    Notes
    -----
    This model resembles binary choice models: the user knows that ``X``
    and ``b`` interact linearly, but ``g(X * b)`` is unknown. In parametric
    binary choice models the user instead assumes a particular distribution
    for ``g()``, such as the normal or the logistic.
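
    Examples
    --------
    A minimal, illustrative sketch on simulated data. The import path
    (``statsmodels.sandbox.nonparametric.kernel_extras``) and the simulated
    link function are assumptions made for this example only; the optimizer
    starts from random values, so results vary between runs::

        import numpy as np
        from statsmodels.sandbox.nonparametric.kernel_extras import (
            SingleIndexModel)

        np.random.seed(123)
        nobs = 100
        X = np.random.normal(size=(nobs, 2))
        index = np.dot(X, [1.0, 2.0])
        # unknown link g(): here a logistic-shaped curve plus noise
        y = 1 / (1 + np.exp(-index)) + 0.1 * np.random.normal(size=nobs)

        model = SingleIndexModel(endog=y, exog=X, var_type='cc')
        mean, mfx = model.fit()   # fitted E[Y|X] and marginal effects dY/dX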
"""

    def __init__(self, endog, exog, var_type):
self.var_type = var_type
self.K = len(var_type)
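        # The kernel regression is performed on the scalar index X * b, so
        # only the first variable type and a single bandwidth are used.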
self.var_type = self.var_type[0]
self.endog = _adjust_shape(endog, 1)
self.exog = _adjust_shape(exog, self.K)
self.nobs = np.shape(self.exog)[0]
self.data_type = self.var_type
self.func = self._est_loc_linear
self.b, self.bw = self._est_b_bw()

    def _est_b_bw(self):
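        """
        Computes the (beta) coefficients and the bandwidth by jointly
        minimizing ``cv_loo`` over ``b`` and ``bw``.
        """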
params0 = np.random.uniform(size=(self.K + 1, ))
b_bw = optimize.fmin(self.cv_loo, params0, disp=0)
b = b_bw[0:self.K]
bw = b_bw[self.K:]
bw = self._set_bw_bounds(bw)
return b, bw

    def cv_loo(self, params):
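        """
        Leave-one-out least-squares cross-validation objective:
        ``(1/n) * sum_i (y_i - g_{-i}(x_i * b))**2``, where ``g_{-i}`` is
        the local linear estimate computed without observation ``i``.
        Minimized jointly over the coefficients ``b`` and the bandwidth.
        """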
        # See p. 254 in [1]
params = np.asarray(params)
b = params[0 : self.K]
bw = params[self.K:]
LOO_X = LeaveOneOut(self.exog)
LOO_Y = LeaveOneOut(self.endog).__iter__()
L = 0
for i, X_not_i in enumerate(LOO_X):
Y = next(LOO_Y)
            G = self.func(bw, endog=Y, exog=-np.dot(X_not_i, b)[:, None],
                          data_predict=-np.dot(self.exog[i:i+1, :], b))[0]
            L += (self.endog[i] - G) ** 2
# Note: There might be a way to vectorize this. See p.72 in [1]
return L / self.nobs

    def fit(self, data_predict=None):
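        """
        Returns the fitted mean ``E[Y|X] = g(X * b)`` and the marginal
        effects ``dY/dX``, evaluated at ``data_predict`` (the estimation
        sample ``exog`` by default).
        """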
if data_predict is None:
data_predict = self.exog
else:
data_predict = _adjust_shape(data_predict, self.K)
N_data_predict = np.shape(data_predict)[0]
mean = np.empty((N_data_predict,))
mfx = np.empty((N_data_predict, self.K))
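        # Local linear regression of y on the scalar index X * b, evaluated
        # at the index value of each prediction point.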
for i in range(N_data_predict):
            mean_mfx = self.func(self.bw, self.endog,
                                 np.dot(self.exog, self.b)[:, None],
                                 data_predict=np.dot(data_predict[i:i+1, :],
                                                     self.b))
mean[i] = mean_mfx[0]
mfx_c = np.squeeze(mean_mfx[1])
mfx[i, :] = mfx_c
return mean, mfx

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "Single Index Model \n"
        rpr += "Number of variables: K = " + str(self.K) + "\n"
        rpr += "Number of samples: nobs = " + str(self.nobs) + "\n"
        rpr += "Variable types: " + self.var_type + "\n"
        rpr += "BW selection method: cv_ls" + "\n"
        rpr += "Estimator type: local linear" + "\n"
        return rpr


class SemiLinear(KernelReg):
"""
Semiparametric partially linear model, ``Y = Xb + g(Z) + e``.
Parameters
----------
endog: array_like
The dependent variable
exog: array_like
The linear component in the regression
exog_nonparametric: array_like
The nonparametric component in the regression
var_type: str
The type of the variables in the nonparametric component;
- c: continuous
- o: ordered
- u: unordered
k_linear : int
The number of variables that comprise the linear component.
Attributes
----------
bw: array_like
Bandwidths for the nonparametric component exog_nonparametric
b: array_like
Coefficients in the linear component
nobs : int
The number of observations.
k_linear : int
The number of variables that comprise the linear component.
Methods
-------
fit(): Returns the fitted mean and marginal effects dy/dz

    Notes
    -----
    This model uses the local linear regression estimator
    (``self._est_loc_linear``).

    References
    ----------
    See chapter on Semiparametric Models in [1]
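
    Examples
    --------
    A minimal, illustrative sketch on simulated data. The import path
    (``statsmodels.sandbox.nonparametric.kernel_extras``) and the simulated
    relationship are assumptions made for this example only; the optimizer
    starts from random values, so results vary between runs::

        import numpy as np
        from statsmodels.sandbox.nonparametric.kernel_extras import SemiLinear

        np.random.seed(123)
        nobs = 200
        X = np.random.normal(size=(nobs, 2))          # linear component
        Z = np.random.uniform(-1, 1, size=(nobs, 1))  # nonparametric component
        y = (np.dot(X, [1.0, -0.5]) + np.sin(3 * Z[:, 0])
             + 0.1 * np.random.normal(size=nobs))

        model = SemiLinear(endog=y, exog=X, exog_nonparametric=Z,
                           var_type='c', k_linear=2)
        mean, mfx = model.fit()   # fitted values and marginal effects dy/dz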
"""

    def __init__(self, endog, exog, exog_nonparametric, var_type, k_linear):
self.endog = _adjust_shape(endog, 1)
self.exog = _adjust_shape(exog, k_linear)
self.K = len(var_type)
self.exog_nonparametric = _adjust_shape(exog_nonparametric, self.K)
self.k_linear = k_linear
self.nobs = np.shape(self.exog)[0]
self.var_type = var_type
self.data_type = self.var_type
self.func = self._est_loc_linear
self.b, self.bw = self._est_b_bw()

    def _est_b_bw(self):
"""
Computes the (beta) coefficients and the bandwidths.
Minimizes ``cv_loo`` with respect to ``b`` and ``bw``.
"""
params0 = np.random.uniform(size=(self.k_linear + self.K, ))
b_bw = optimize.fmin(self.cv_loo, params0, disp=0)
b = b_bw[0 : self.k_linear]
bw = b_bw[self.k_linear:]
#bw = self._set_bw_bounds(np.asarray(bw))
return b, bw

    def cv_loo(self, params):
"""
Similar to the cross validation leave-one-out estimator.
Modified to reflect the linear components.
Parameters
----------
params: array_like
Vector consisting of the coefficients (b) and the bandwidths (bw).
The first ``k_linear`` elements are the coefficients.
Returns
-------
L: float
The value of the objective function
References
----------
See p.254 in [1]
"""
params = np.asarray(params)
b = params[0 : self.k_linear]
bw = params[self.k_linear:]
LOO_X = LeaveOneOut(self.exog)
LOO_Y = LeaveOneOut(self.endog).__iter__()
LOO_Z = LeaveOneOut(self.exog_nonparametric).__iter__()
Xb = np.dot(self.exog, b)[:,None]
L = 0
for ii, X_not_i in enumerate(LOO_X):
Y = next(LOO_Y)
Z = next(LOO_Z)
Xb_j = np.dot(X_not_i, b)[:,None]
Yx = Y - Xb_j
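            # Kernel-regress the residuals (y - X*b) on Z with observation
            # ``ii`` left out to estimate g(.) at ``z_ii``.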
G = self.func(bw, endog=Yx, exog=-Z,
data_predict=-self.exog_nonparametric[ii, :])[0]
            lt = Xb[ii, :]  # linear term x_ii * b
L += (self.endog[ii] - lt - G) ** 2
return L

    def fit(self, exog_predict=None, exog_nonparametric_predict=None):
"""Computes fitted values and marginal effects"""
if exog_predict is None:
exog_predict = self.exog
else:
exog_predict = _adjust_shape(exog_predict, self.k_linear)
if exog_nonparametric_predict is None:
exog_nonparametric_predict = self.exog_nonparametric
else:
exog_nonparametric_predict = _adjust_shape(exog_nonparametric_predict, self.K)
N_data_predict = np.shape(exog_nonparametric_predict)[0]
mean = np.empty((N_data_predict,))
mfx = np.empty((N_data_predict, self.K))
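        # Remove the estimated linear part from y; the remainder is
        # kernel-regressed on the nonparametric component.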
Y = self.endog - np.dot(exog_predict, self.b)[:,None]
for i in range(N_data_predict):
mean_mfx = self.func(self.bw, Y, self.exog_nonparametric,
data_predict=exog_nonparametric_predict[i, :])
mean[i] = mean_mfx[0]
mfx_c = np.squeeze(mean_mfx[1])
mfx[i, :] = mfx_c
return mean, mfx

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "Semiparametric Partially Linear Model \n"
        rpr += "Number of variables: K = " + str(self.K) + "\n"
        rpr += "Number of samples: N = " + str(self.nobs) + "\n"
        rpr += "Variable types: " + self.var_type + "\n"
        rpr += "BW selection method: cv_ls" + "\n"
        rpr += "Estimator type: local linear" + "\n"
        return rpr