"""
Univariate lowess function, like in R.
References
----------
Hastie, Tibshirani, Friedman. (2009) The Elements of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition: Chapter 6.
Cleveland, W.S. (1979) "Robust Locally Weighted Regression and Smoothing Scatterplots". Journal of the American Statistical Association 74 (368): 829-836.
"""
from statsmodels.compat.python import range
import numpy as np
from scipy.linalg import lstsq
[docs]def lowess(endog, exog, frac=2./3, it=3):
"""
LOWESS (Locally Weighted Scatterplot Smoothing)
A lowess function that outs smoothed estimates of endog
at the given exog values from points (exog, endog)
Parameters
----------
endog: 1-D numpy array
The y-values of the observed points
exog: 1-D numpy array
The x-values of the observed points
frac: float
Between 0 and 1. The fraction of the data used
when estimating each y-value.
it: int
The number of residual-based reweightings
to perform.
Returns
-------
out: numpy array
A numpy array with two columns. The first column
is the sorted x values and the second column the
associated estimated y-values.
Notes
-----
This lowess function implements the algorithm given in the
reference below using local linear estimates.
Suppose the input data has N points. The algorithm works by
estimating the true ``y_i`` by taking the frac*N closest points
to ``(x_i,y_i)`` based on their x values and estimating ``y_i``
using a weighted linear regression. The weight for ``(x_j,y_j)``
is `_lowess_tricube` function applied to ``|x_i-x_j|``.
If ``iter > 0``, then further weighted local linear regressions
are performed, where the weights are the same as above
times the `_lowess_bisquare` function of the residuals. Each iteration
takes approximately the same amount of time as the original fit,
so these iterations are expensive. They are most useful when
the noise has extremely heavy tails, such as Cauchy noise.
Noise with less heavy-tails, such as t-distributions with ``df > 2``,
are less problematic. The weights downgrade the influence of
points with large residuals. In the extreme case, points whose
residuals are larger than 6 times the median absolute residual
are given weight 0.
Some experimentation is likely required to find a good
choice of frac and iter for a particular dataset.
References
----------
Cleveland, W.S. (1979) "Robust Locally Weighted Regression
and Smoothing Scatterplots". Journal of the American Statistical
Association 74 (368): 829-836.
Examples
--------
The below allows a comparison between how different the fits from
`lowess` for different values of frac can be.
>>> import numpy as np
>>> import statsmodels.api as sm
>>> lowess = sm.nonparametric.lowess
>>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500)
>>> y = np.sin(x) + np.random.normal(size=len(x))
>>> z = lowess(y, x)
>>> w = lowess(y, x, frac=1./3)
This gives a similar comparison for when it is 0 vs not.
>>> import scipy.stats as stats
>>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500)
>>> y = np.sin(x) + stats.cauchy.rvs(size=len(x))
>>> z = lowess(y, x, frac= 1./3, it=0)
>>> w = lowess(y, x, frac=1./3)
"""
x = exog
if exog.ndim != 1:
raise ValueError('exog must be a vector')
if endog.ndim != 1:
raise ValueError('endog must be a vector')
if endog.shape[0] != x.shape[0] :
raise ValueError('exog and endog must have same length')
n = exog.shape[0]
fitted = np.zeros(n)
k = int(frac * n)
index_array = np.argsort(exog)
x_copy = np.array(exog[index_array]) #, dtype ='float32')
y_copy = endog[index_array]
fitted, weights = _lowess_initial_fit(x_copy, y_copy, k, n)
for i in range(it):
_lowess_robustify_fit(x_copy, y_copy, fitted,
weights, k, n)
out = np.array([x_copy, fitted]).T
out.shape = (n,2)
return out
def _lowess_initial_fit(x_copy, y_copy, k, n):
"""
The initial weighted local linear regression for lowess.
Parameters
----------
x_copy : 1-d ndarray
The x-values/exogenous part of the data being smoothed
y_copy : 1-d ndarray
The y-values/ endogenous part of the data being smoothed
k : int
The number of data points which affect the linear fit for
each estimated point
n : int
The total number of points
Returns
-------
fitted : 1-d ndarray
The fitted y-values
weights : 2-d ndarray
An n by k array. The contribution to the weights in the
local linear fit coming from the distances between the
x-values
"""
weights = np.zeros((n,k), dtype = x_copy.dtype)
nn_indices = [0,k]
X = np.ones((k,2))
fitted = np.zeros(n)
for i in range(n):
#note: all _lowess functions are inplace, no return
left_width = x_copy[i] - x_copy[nn_indices[0]]
right_width = x_copy[nn_indices[1]-1] - x_copy[i]
width = max(left_width, right_width)
_lowess_wt_standardize(weights[i,:],
x_copy[nn_indices[0]:nn_indices[1]],
x_copy[i], width)
_lowess_tricube(weights[i,:])
weights[i,:] = np.sqrt(weights[i,:])
X[:,1] = x_copy[nn_indices[0]:nn_indices[1]]
y_i = weights[i,:] * y_copy[nn_indices[0]:nn_indices[1]]
beta = lstsq(weights[i,:].reshape(k,1) * X, y_i)[0]
fitted[i] = beta[0] + beta[1]*x_copy[i]
_lowess_update_nn(x_copy, nn_indices, i+1)
return fitted, weights
def _lowess_wt_standardize(weights, new_entries, x_copy_i, width):
"""
The initial phase of creating the weights.
Subtract the current x_i and divide by the width.
Parameters
----------
weights : ndarray
The memory where (new_entries - x_copy_i)/width will be placed
new_entries : ndarray
The x-values of the k closest points to x[i]
x_copy_i : float
x[i], the i'th point in the (sorted) x values
width : float
The maximum distance between x[i] and any point in new_entries
Returns
-------
Nothing. The modifications are made to weight in place.
"""
weights[:] = new_entries
weights -= x_copy_i
weights /= width
def _lowess_robustify_fit(x_copy, y_copy, fitted, weights, k, n):
"""
Additional weighted local linear regressions, performed if
iter>0. They take into account the sizes of the residuals,
to eliminate the effect of extreme outliers.
Parameters
----------
x_copy : 1-d ndarray
The x-values/exogenous part of the data being smoothed
y_copy : 1-d ndarray
The y-values/ endogenous part of the data being smoothed
fitted : 1-d ndarray
The fitted y-values from the previous iteration
weights : 2-d ndarray
An n by k array. The contribution to the weights in the
local linear fit coming from the distances between the
x-values
k : int
The number of data points which affect the linear fit for
each estimated point
n : int
The total number of points
Returns
-------
Nothing. The fitted values are modified in place.
"""
nn_indices = [0,k]
X = np.ones((k,2))
residual_weights = np.copy(y_copy)
residual_weights.shape = (n,)
residual_weights -= fitted
residual_weights = np.absolute(residual_weights)#, out=residual_weights)
s = np.median(residual_weights)
residual_weights /= (6*s)
too_big = residual_weights>=1
_lowess_bisquare(residual_weights)
residual_weights[too_big] = 0
for i in range(n):
total_weights = weights[i,:] * np.sqrt(residual_weights[nn_indices[0]:
nn_indices[1]])
X[:,1] = x_copy[nn_indices[0]:nn_indices[1]]
y_i = total_weights * y_copy[nn_indices[0]:nn_indices[1]]
total_weights.shape = (k,1)
beta = lstsq(total_weights * X, y_i)[0]
fitted[i] = beta[0] + beta[1] * x_copy[i]
_lowess_update_nn(x_copy, nn_indices, i+1)
def _lowess_update_nn(x, cur_nn,i):
"""
Update the endpoints of the nearest neighbors to
the ith point.
Parameters
----------
x : iterable
The sorted points of x-values
cur_nn : list of length 2
The two current indices between which are the
k closest points to x[i]. (The actual value of
k is irrelevant for the algorithm.
i : int
The index of the current value in x for which
the k closest points are desired.
Returns
-------
Nothing. It modifies cur_nn in place.
"""
while True:
if cur_nn[1]<x.size:
left_dist = x[i] - x[cur_nn[0]]
new_right_dist = x[cur_nn[1]] - x[i]
if new_right_dist < left_dist:
cur_nn[0] = cur_nn[0] + 1
cur_nn[1] = cur_nn[1] + 1
else:
break
else:
break
def _lowess_tricube(t):
"""
The _tricube function applied to a numpy array.
The tricube function is (1-abs(t)**3)**3.
Parameters
----------
t : ndarray
Array the tricube function is applied to elementwise and
in-place.
Returns
-------
Nothing
"""
#t = (1-np.abs(t)**3)**3
t[:] = np.absolute(t) #, out=t) #numpy version?
_lowess_mycube(t)
t[:] = np.negative(t) #, out = t)
t += 1
_lowess_mycube(t)
def _lowess_mycube(t):
"""
Fast matrix cube
Parameters
----------
t : ndarray
Array that is cubed, elementwise and in-place
Returns
-------
Nothing
"""
#t **= 3
t2 = t*t
t *= t2
def _lowess_bisquare(t):
"""
The bisquare function applied to a numpy array.
The bisquare function is (1-t**2)**2.
Parameters
----------
t : ndarray
array bisquare function is applied to, element-wise and in-place.
Returns
-------
Nothing
"""
#t = (1-t**2)**2
t *= t
t[:] = np.negative(t) #, out=t)
t += 1
t *= t