Source code for statsmodels.sandbox.datarich.factormodels

# -*- coding: utf-8 -*-
"""
Created on Sun Nov 14 08:21:41 2010

Author: josef-pktd
License: BSD (3-clause)
"""

from __future__ import print_function
import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.tools import pca
from statsmodels.sandbox.tools.cross_val import LeaveOneOut

#conversion of the Principal Component Regression example
#from sandbox/example_pca_regression.py into a class


class FactorModelUnivariate(object):
    '''Univariate factor model, converted from the PCA regression example

    Todo:
    check treatment of const, make it optional?
        add hasconst (0 or 1), needed when selecting nfact + hasconst
    options are arguments in calc_factors, they should be more public instead
    cross-validation is slow for a large number of observations
    '''

    def __init__(self, endog, exog):
        #do this in a superclass?
        self.endog = np.asarray(endog)
        self.exog = np.asarray(exog)

    def calc_factors(self, x=None, keepdim=0, addconst=True):
        '''get factor decomposition of exogenous variables

        This uses principal component analysis to obtain the factors. The
        number of factors kept is the maximum that will be considered in
        the regression.
        '''
        if x is None:
            x = self.exog
        else:
            x = np.asarray(x)
        xred, fact, evals, evecs = pca(x, keepdim=keepdim, normalize=1)
        self.exog_reduced = xred
        if addconst:
            self.factors = sm.add_constant(fact, prepend=True)
            self.hasconst = 1  #needs to be int
        else:
            self.factors = fact
            self.hasconst = 0  #needs to be int
        self.evals = evals
        self.evecs = evecs
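
    # A minimal usage sketch (hypothetical names; ``y`` is a (nobs,) array,
    # ``x`` a (nobs, nvar) array):
    #
    #     mod = FactorModelUnivariate(y, x)
    #     mod.calc_factors()
    #     mod.factors.shape    # (nobs, nvar + 1), constant in first column
    #     mod.evals            # eigenvalues, largest first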

    def fit_fixed_nfact(self, nfact):
        if not hasattr(self, 'factors'):
            self.calc_factors()
        return sm.OLS(self.endog, self.factors[:, :nfact + self.hasconst]).fit()
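
    # Sketch: ``mod.fit_fixed_nfact(2)`` returns the fitted OLS results for
    # the regression of endog on the constant plus the first two principal
    # component factors (assuming calc_factors ran with addconst=True).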

    def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
        '''estimate the model and selection criteria for up to maxfact factors

        The selection criteria that are calculated are AIC, BIC, and R2_adj,
        and additionally the cross-validation prediction error sum of squares
        if `skip_crossval` is false. Cross-validation is not used by default
        because it can be time consuming to calculate.

        By default the cross-validation method is leave-one-out on the full
        dataset. A different cross-validation sample can be specified as an
        argument to `cv_iter`.

        Results are attached in `results_find_nfact`.
        '''
        if not hasattr(self, 'factors'):
            self.calc_factors()

        hasconst = self.hasconst
        if maxfact is None:
            maxfact = self.factors.shape[1] - hasconst

        if (maxfact + hasconst) < 1:
            raise ValueError('nothing to do, number of factors '
                             '(incl. constant) should be at least 1')

        #temporary safety
        maxfact = min(maxfact, 10)
        y0 = self.endog
        results = []
        #k includes the constant; slicing the precomputed factors is faster
        #than a new pca call for each k and gives the same result
        for k in range(1, maxfact + hasconst + 1):
            fact = self.factors[:, :k]
            res = sm.OLS(y0, fact).fit()

            if not skip_crossval:
                if cv_iter is None:
                    cv_iter = LeaveOneOut(len(y0))
                prederr2 = 0.
                for inidx, outidx in cv_iter:
                    res_l1o = sm.OLS(y0[inidx], fact[inidx, :]).fit()
                    predicted = res_l1o.model.predict(res_l1o.params,
                                                      fact[outidx, :])
                    prederr2 += ((y0[outidx] - predicted)**2.).sum()
            else:
                prederr2 = np.nan

            results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

        self.results_find_nfact = results = np.array(results)
        self.best_nfact = np.r_[(np.argmin(results[:, 1:3], 0),
                                 np.argmax(results[:, 3], 0),
                                 np.argmin(results[:, -1], 0))]
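
    # Reading the selection results (sketch): each row of
    # ``results_find_nfact`` is [k, aic, bic, rsquared_adj, press], and
    # ``best_nfact`` holds the row indices of the best model by AIC, BIC,
    # R2_adj and cross-validation prediction error, so for example
    #
    #     mod.fit_find_nfact(maxfact=4, skip_crossval=False)
    #     k_aic = int(mod.results_find_nfact[mod.best_nfact[0], 0])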

    def summary_find_nfact(self):
        '''provides a summary for the selection of the number of factors

        Returns
        -------
        sumstr : str
            summary of the results for selecting the number of factors
        '''
        if not hasattr(self, 'results_find_nfact'):
            self.fit_find_nfact()

        results = self.results_find_nfact
        sumstr = ''
        sumstr += '\n' + 'Best result for k, by AIC, BIC, R2_adj, L1O'
        #best_nfact holds row indices, column 0 holds the corresponding k
        best_k = results[self.best_nfact, 0]
        sumstr += '\n' + ' '*19 + '%5d %4d %6d %5d' % tuple(best_k)

        from statsmodels.iolib.table import SimpleTable

        headers = 'k, AIC, BIC, R2_adj, L1O'.split(', ')
        numformat = ['%6d'] + ['%10.3f']*4
        txt_fmt1 = dict(data_fmts=numformat)
        tabl = SimpleTable(results, headers, None, txt_fmt=txt_fmt1)

        #title is left over from the original example script
        sumstr += '\n' + "PCA regression on simulated data,"
        sumstr += '\n' + "DGP: 2 factors and 4 explanatory variables"
        sumstr += '\n' + tabl.__str__()
        sumstr += '\n' + "Notes: k is the number of regressors including the constant,"
        sumstr += '\n' + "       k=1 means regression on the constant only"
        sumstr += '\n' + "       L1O: sum of squared prediction errors for leave-one-out"
        return sumstr

if __name__ == '__main__':

    examples = [1]
    if 1 in examples:
        nobs = 500
        f0 = np.c_[np.random.normal(size=(nobs, 2)), np.ones((nobs, 1))]
        #three alternative factor loading matrices, only the last one is used
        f2xcoef = np.c_[np.repeat(np.eye(2), 2, 0), np.arange(4)[::-1]].T
        f2xcoef = np.array([[1., 1., 0., 0.],
                            [0., 0., 1., 1.],
                            [3., 2., 1., 0.]])
        f2xcoef = np.array([[0.1, 3., 1., 0.],
                            [0., 0., 1.5, 0.1],
                            [3., 2., 1., 0.]])
        x0 = np.dot(f0, f2xcoef)
        x0 += 0.1 * np.random.normal(size=x0.shape)
        ytrue = np.dot(f0, [1., 1., 1.])
        y0 = ytrue + 0.1 * np.random.normal(size=ytrue.shape)

        mod = FactorModelUnivariate(y0, x0)
        print(mod.summary_find_nfact())
        print("with cross validation - slower")
        mod.fit_find_nfact(maxfact=None, skip_crossval=False, cv_iter=None)
        print(mod.summary_find_nfact())
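
        #k-fold cross-validation sketch: KFold from the sandbox cross_val
        #module (assumed signature KFold(nobs, k)) can replace leave-one-out
        #and cuts the number of OLS fits per candidate model from nobs to k
        from statsmodels.sandbox.tools.cross_val import KFold
        mod.fit_find_nfact(maxfact=None, skip_crossval=False,
                           cv_iter=KFold(nobs, 5))
        print(mod.summary_find_nfact())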