Source code for statsmodels.stats.multivariate_tools

'''Tools for multivariate analysis


Author : Josef Perktold
License : BSD-3



TODO:

- names of functions, currently just "working titles"

'''



import numpy as np

# temporarily here, used in return
[docs]class Bunch(dict):
[docs]    def __init__(self, **kw):
        dict.__init__(self, kw)
        self.__dict__  = self


[docs]def partial_project(endog, exog):
    '''helper function to get linear projection or partialling out of variables

    endog variables are projected on exog variables

    Parameters
    ----------
    endog : ndarray
        array of variables where the effect of exog is partialled out.
    exog : ndarray
        array of variables on which the endog variables are projected.

    Returns
    -------
    res : instance of Bunch with

        - params : OLS parameter estimates from projection of endog on exog
        - fittedvalues : predicted values of endog given exog
        - resid : residual of the regression, values of endog with effect of
          exog partialled out

    Notes
    -----
    This is no-frills mainly for internal calculations, no error checking or
    array conversion is performed, at least for now.

    '''
    x1, x2 = endog, exog
    params = np.linalg.pinv(x2).dot(x1)
    predicted = x2.dot(params)
    residual = x1 - predicted
    res = Bunch(params=params,
                fittedvalues=predicted,
                resid=residual)

    return res



[docs]def cancorr(x1, x2, demean=True, standardize=False):
    '''canonical correlation coefficient beween 2 arrays

    Parameters
    ----------
    x1, x2 : ndarrays, 2_D
        two 2-dimensional data arrays, observations in rows, variables in columns
    demean : bool
         If demean is true, then the mean is subtracted from each variable
    standardize : bool
         If standardize is true, then each variable is demeaned and divided by
         its standard deviation. Rescaling does not change the canonical
         correlation coefficients.

    Returns
    -------
    ccorr : ndarray, 1d
        canonical correlation coefficients, sorted from largest to smallest.
        Note, that these are the square root of the eigenvalues.

    Notes
    -----
    This is a helper function for other statistical functions. It only
    calculates the canonical correlation coefficients and does not do a full
    canoncial correlation analysis

    The canonical correlation coefficient is calculated with the generalized
    matrix inverse and does not raise an exception if one of the data arrays
    have less than full column rank.

    See Also
    --------
    cc_ranktest
    cc_stats
    CCA not yet

    '''
    #x, y = x1, x2
    if demean or standardize:
        x1 = (x1 - x1.mean(0))
        x2 = (x2 - x2.mean(0))

    if standardize:
        #std doesn't make a difference to canonical correlation coefficients
        x1 /= x1.std(0)
        x2 /= x2.std(0)

    t1 = np.linalg.pinv(x1).dot(x2)
    t2 = np.linalg.pinv(x2).dot(x1)
    m = t1.dot(t2)
    cc = np.sqrt(np.linalg.eigvals(m))
    cc.sort()
    return cc[::-1]


[docs]def cc_ranktest(x1, x2, demean=True, fullrank=False):
    '''rank tests based on smallest canonical correlation coefficients

    Anderson canonical correlations test (LM test) and
    Cragg-Donald test (Wald test)
    Assumes homoskedasticity and independent observations, overrejects if
    there is heteroscedasticity or autocorrelation.

    The Null Hypothesis is that the rank is k - 1, the alternative hypothesis
    is that the rank is at least k.


    Parameters
    ----------
    x1, x2 : ndarrays, 2_D
        two 2-dimensional data arrays, observations in rows, variables in columns
    demean : bool
         If demean is true, then the mean is subtracted from each variable.
    fullrank : bool
         If true, then only the test that the matrix has full rank is returned.
         If false, the test for all possible ranks are returned. However, no
         the p-values are not corrected for the multiplicity of tests.

    Returns
    -------
    value : float
        value of the test statistic
    p-value : float
        p-value for the test Null Hypothesis tha the smallest canonical
        correlation coefficient is zero. based on chi-square distribution
    df : int
        degrees of freedom for thechi-square distribution in the hypothesis test
    ccorr : ndarray, 1d
        All canonical correlation coefficients sorted from largest to smallest.

    Notes
    -----
    Degrees of freedom for the distribution of the test statistic are based on
    number of columns of x1 and x2 and not on their matrix rank.
    (I'm not sure yet what the interpretation of the test is if x1 or x2 are of
    reduced rank.)

    See Also
    --------
    cancorr
    cc_stats

    '''

    from scipy import stats

    nobs1, k1 = x1.shape
    nobs2, k2 = x2.shape

    cc = cancorr(x1, x2, demean=demean)
    cc2 = cc * cc
    if fullrank:
        df = np.abs(k1 - k2) + 1
        value = nobs1 * cc2[-1]
        w_value = nobs1 * (cc2[-1] / (1. - cc2[-1]))
        return value, stats.chi2.sf(value, df), df, cc, w_value, stats.chi2.sf(w_value, df)
    else:
        r = np.arange(min(k1, k2))[::-1]
        df = (k1 - r) * (k2 - r)
        values = nobs1 * cc2[::-1].cumsum()
        w_values = nobs1 * (cc2 / (1. - cc2))[::-1].cumsum()
        return values, stats.chi2.sf(values, df), df, cc, w_values, stats.chi2.sf(w_values, df)


[docs]def cc_stats(x1, x2, demean=True):
    '''MANOVA statistics based on canonical correlation coefficient

    Calculates Pillai's Trace, Wilk's Lambda, Hotelling's Trace and
    Roy's Largest Root.

    Parameters
    ----------
    x1, x2 : ndarrays, 2_D
        two 2-dimensional data arrays, observations in rows, variables in columns
    demean : bool
         If demean is true, then the mean is subtracted from each variable.

    Returns
    -------
    res : dict
        Dictionary containing the test statistics.

    Notes
    -----

    same as `canon` in Stata

    missing: F-statistics and p-values

    TODO: should return a results class instead
    produces nans sometimes, singular, perfect correlation of x1, x2 ?

    '''

    nobs1, k1 = x1.shape  # endogenous ?
    nobs2, k2 = x2.shape
    cc = cancorr(x1, x2, demean=demean)
    cc2 = cc**2
    lam = (cc2 / (1 - cc2))  # what if max cc2 is 1 ?
    # Problem: ccr might not care if x1 or x2 are reduced rank,
    #          but df will depend on rank
    df_model = k1 * k2  # df_hypothesis (we do not include mean in x1, x2)
    df_resid = k1 * (nobs1 - k2 - demean)
    s = min(df_model, k1)
    m = 0.5 * (df_model - k1)
    n = 0.5 * (df_resid - k1 - 1)

    df1 = k1 * df_model
    df2 = k2


    pt_value = cc2.sum()    # Pillai's trace
    wl_value = np.product(1 / (1 + lam))   # Wilk's Lambda
    ht_value = lam.sum()    # Hotelling's Trace
    rm_value = lam.max()    # Roy's largest root
    #from scipy import stats
    # what's the distribution, the test statistic ?
    res = {}
    res['canonical correlation coefficient'] = cc
    res['eigenvalues'] = lam
    res["Pillai's Trace"] = pt_value
    res["Wilk's Lambda"] = wl_value
    res["Hotelling's Trace"] = ht_value
    res["Roy's Largest Root"] = rm_value
    res['df_resid'] = df_resid
    res['df_m'] = m
    return res