Source code for statsmodels.stats.stattools

"""
Statistical tests to be used in conjunction with the models

Notes
-----
These functions haven't been formally tested.
"""

from scipy import stats
import numpy as np


# TODO: these are pretty straightforward but they should be tested
[docs]def durbin_watson(resids, axis=0): """ Calculates the Durbin-Watson statistic Parameters ----------- resids : array-like Returns -------- dw : float, array-like The Durbin-Watson statistic. Notes ----- The null hypothesis of the test is that there is no serial correlation. The Durbin-Watson test statistics is defined as: .. math:: \sum_{t=2}^T((e_t - e_{t-1})^2)/\sum_{t=1}^Te_t^2 The test statistic is approximately equal to 2*(1-r) where ``r`` is the sample autocorrelation of the residuals. Thus, for r == 0, indicating no serial correlation, the test statistic equals 2. This statistic will always be between 0 and 4. The closer to 0 the statistic, the more evidence for positive serial correlation. The closer to 4, the more evidence for negative serial correlation. """ resids = np.asarray(resids) diff_resids = np.diff(resids, 1, axis=axis) dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis) return dw
[docs]def omni_normtest(resids, axis=0): """ Omnibus test for normality Parameters ----------- resid : array-like axis : int, optional Default is 0 Returns ------- Chi^2 score, two-tail probability """ #TODO: change to exception in summary branch and catch in summary() #behavior changed between scipy 0.9 and 0.10 resids = np.asarray(resids) n = resids.shape[axis] if n < 8: from warnings import warn warn("omni_normtest is not valid with less than 8 observations; %i " "samples were given." % int(n)) return np.nan, np.nan return stats.normaltest(resids, axis=axis)
[docs]def jarque_bera(resids, axis=0): """ Calculate residual skewness, kurtosis, and do the JB test for normality Parameters ----------- resids : array-like axis : int, optional Default is 0 Returns ------- JB, JBpv, skew, kurtosis JB = n/6*(S^2 + (K-3)^2/4) JBpv is the Chi^2 two-tail probability value skew is the measure of skewness kurtosis is the measure of kurtosis """ resids = np.asarray(resids) # Calculate residual skewness and kurtosis skew = stats.skew(resids, axis=axis) kurtosis = 3 + stats.kurtosis(resids, axis=axis) # Calculate the Jarque-Bera test for normality n = resids.shape[axis] jb = (n / 6.) * (skew**2 + (1 / 4.) * (kurtosis - 3)**2) jb_pv = stats.chi2.sf(jb, 2) return jb, jb_pv, skew, kurtosis
[docs]def robust_skewness(y, axis=0): """ Calculates the four skewness measures in Kim & White Parameters ---------- y : array-like axis : int or None, optional Axis along which the skewness measures are computed. If `None`, the entire array is used. Returns ------- sk1 : ndarray The standard skewness estimator. sk2 : ndarray Skewness estimator based on quartiles. sk3 : ndarray Skewness estimator based on mean-median difference, standardized by absolute deviation. sk4 : ndarray Skewness estimator based on mean-median difference, standardized by standard deviation. Notes ----- The robust skewness measures are defined .. math :: SK_{2}=\\frac{\\left(q_{.75}-q_{.5}\\right) -\\left(q_{.5}-q_{.25}\\right)}{q_{.75}-q_{.25}} .. math :: SK_{3}=\\frac{\\mu-\\hat{q}_{0.5}} {\\hat{E}\\left[\\left|y-\\hat{\\mu}\\right|\\right]} .. math :: SK_{4}=\\frac{\\mu-\\hat{q}_{0.5}}{\\hat{\\sigma}} .. [1] Tae-Hwan Kim and Halbert White, "On more robust estimation of skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, March 2004. """ if axis is None: y = y.ravel() axis = 0 y = np.sort(y, axis) q1, q2, q3 = np.percentile(y, [25.0, 50.0, 75.0], axis=axis) mu = y.mean(axis) shape = (y.size,) if axis is not None: shape = list(mu.shape) shape.insert(axis, 1) shape = tuple(shape) mu_b = np.reshape(mu, shape) q2_b = np.reshape(q2, shape) sigma = np.mean(((y - mu_b)**2), axis) sk1 = stats.skew(y, axis=axis) sk2 = (q1 + q3 - 2.0 * q2) / (q3 - q1) sk3 = (mu - q2) / np.mean(abs(y - q2_b), axis=axis) sk4 = (mu - q2) / sigma return sk1, sk2, sk3, sk4
def _kr3(y, alpha=5.0, beta=50.0): """ KR3 estimator from Kim & White Parameters ---------- y : array-like, 1-d alpha : float, optional Lower cut-off for measuring expectation in tail. beta : float, optional Lower cut-off for measuring expectation in center. Returns ------- kr3 : float Robust kurtosis estimator based on standardized lower- and upper-tail expected values Notes ----- .. [1] Tae-Hwan Kim and Halbert White, "On more robust estimation of skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, March 2004. """ perc = (alpha, 100.0 - alpha, beta, 100.0 - beta) lower_alpha, upper_alpha, lower_beta, upper_beta = np.percentile(y, perc) l_alpha = np.mean(y[y < lower_alpha]) u_alpha = np.mean(y[y > upper_alpha]) l_beta = np.mean(y[y < lower_beta]) u_beta = np.mean(y[y > upper_beta]) return (u_alpha - l_alpha) / (u_beta - l_beta)
[docs]def expected_robust_kurtosis(ab=(5.0, 50.0), dg=(2.5, 25.0)): """ Calculates the expected value of the robust kurtosis measures in Kim and White assuming the data are normally distributed. Parameters ---------- ab: iterable, optional Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail quantile cut-off for measuring the extreme tail and beta is the central quantile cutoff for the standardization of the measure db: iterable, optional Contains 100*(delta, gamma) in the kr4 measure where delta is the tail quantile for measuring extreme values and gamma is the central quantile used in the the standardization of the measure Returns ------- ekr: array, 4-element Contains the expected values of the 4 robust kurtosis measures Notes ----- See `robust_kurtosis` for definitions of the robust kurtosis measures """ alpha, beta = ab delta, gamma = dg expected_value = np.zeros(4) ppf = stats.norm.ppf pdf = stats.norm.pdf q1, q2, q3, q5, q6, q7 = ppf(np.array((1.0, 2.0, 3.0, 5.0, 6.0, 7.0)) / 8) expected_value[0] = 3 expected_value[1] = ((q7 - q5) + (q3 - q1)) / (q6 - q2) q_alpha, q_beta = ppf(np.array((alpha / 100.0, beta / 100.0))) expected_value[2] = (2 * pdf(q_alpha) / alpha) / (2 * pdf(q_beta) / beta) q_delta, q_gamma = ppf(np.array((delta / 100.0, gamma / 100.0))) expected_value[3] = (-2.0 * q_delta) / (-2.0 * q_gamma) return expected_value
[docs]def robust_kurtosis(y, axis=0, ab=(5.0, 50.0), dg=(2.5, 25.0), excess=True): """ Calculates the four kurtosis measures in Kim & White Parameters ---------- y : array-like axis : int or None, optional Axis along which the kurtoses are computed. If `None`, the entire array is used. ab: iterable, optional Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail quantile cut-off for measuring the extreme tail and beta is the central quantile cutoff for the standardization of the measure db: iterable, optional Contains 100*(delta, gamma) in the kr4 measure where delta is the tail quantile for measuring extreme values and gamma is the central quantile used in the the standardization of the measure excess : bool, optional If true (default), computed values are excess of those for a standard normal distribution. Returns ------- kr1 : ndarray The standard kurtosis estimator. kr2 : ndarray Kurtosis estimator based on octiles. kr3 : ndarray Kurtosis estimators based on exceedence expectations. kr4 : ndarray Kurtosis measure based on the spread between high and low quantiles. Notes ----- The robust kurtosis measures are defined .. math:: KR_{2}=\\frac{\\left(\\hat{q}_{.875}-\\hat{q}_{.625}\\right) +\\left(\\hat{q}_{.375}-\\hat{q}_{.125}\\right)} {\\hat{q}_{.75}-\\hat{q}_{.25}} .. math:: KR_{3}=\\frac{\\hat{E}\\left(y|y>\\hat{q}_{1-\\alpha}\\right) -\\hat{E}\\left(y|y<\\hat{q}_{\\alpha}\\right)} {\\hat{E}\\left(y|y>\\hat{q}_{1-\\beta}\\right) -\\hat{E}\\left(y|y<\\hat{q}_{\\beta}\\right)} .. math:: KR_{4}=\\frac{\\hat{q}_{1-\\delta}-\\hat{q}_{\\delta}} {\\hat{q}_{1-\\gamma}-\\hat{q}_{\\gamma}} where :math:`\\hat{q}_{p}` is the estimated quantile at :math:`p`. .. [1] Tae-Hwan Kim and Halbert White, "On more robust estimation of skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, March 2004. """ if (axis is None or (y.squeeze().ndim == 1 and y.ndim != 1)): y = y.ravel() axis = 0 alpha, beta = ab delta, gamma = dg perc = (12.5, 25.0, 37.5, 62.5, 75.0, 87.5, delta, 100.0 - delta, gamma, 100.0 - gamma) e1, e2, e3, e5, e6, e7, fd, f1md, fg, f1mg = np.percentile(y, perc, axis=axis) expected_value = expected_robust_kurtosis(ab, dg) if excess else np.zeros(4) kr1 = stats.kurtosis(y, axis, False) - expected_value[0] kr2 = ((e7 - e5) + (e3 - e1)) / (e6 - e2) - expected_value[1] if y.ndim == 1: kr3 = _kr3(y, alpha, beta) else: kr3 = np.apply_along_axis(_kr3, axis, y, alpha, beta) kr3 -= expected_value[2] kr4 = (f1md - fd) / (f1mg - fg) - expected_value[3] return kr1, kr2, kr3, kr4
def _medcouple_1d(y): """ Calculates the medcouple robust measure of skew. Parameters ---------- y : array-like, 1-d Returns ------- mc : float The medcouple statistic Notes ----- The current algorithm requires a O(N**2) memory allocations, and so may not work for very large arrays (N>10000). .. [1] M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed distributions" Computational Statistics & Data Analysis, vol. 52, pp. 5186-5201, August 2008. """ # Parameter changes the algorithm to the slower for large n y = np.squeeze(np.asarray(y)) if y.ndim != 1: raise ValueError("y must be squeezable to a 1-d array") y = np.sort(y) n = y.shape[0] if n % 2 == 0: mf = (y[n // 2 - 1] + y[n // 2]) / 2 else: mf = y[(n - 1) // 2] z = y - mf lower = z[z <= 0.0] upper = z[z >= 0.0] upper = upper[:, None] standardization = upper - lower is_zero = np.logical_and(lower == 0.0, upper == 0.0) standardization[is_zero] = np.inf spread = upper + lower return np.median(spread / standardization)
[docs]def medcouple(y, axis=0): """ Calculates the medcouple robust measure of skew. Parameters ---------- y : array-like axis : int or None, optional Axis along which the medcouple statistic is computed. If `None`, the entire array is used. Returns ------- mc : ndarray The medcouple statistic with the same shape as `y`, with the specified axis removed. Notes ----- The current algorithm requires a O(N**2) memory allocations, and so may not work for very large arrays (N>10000). .. [1] M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed distributions" Computational Statistics & Data Analysis, vol. 52, pp. 5186-5201, August 2008. """ if axis is None: return _medcouple_1d(y.ravel()) return np.apply_along_axis(_medcouple_1d, axis, y)