# statsmodels.sandbox.distributions.mv_measures

'''using multivariate dependence and divergence measures

The standard correlation coefficient measures only linear dependence between
random variables.
Kendall's tau measures any monotonic relationship, including non-linear ones.

Mutual information measures any kind of dependence, but does not distinguish
between positive and negative relationships.


mutualinfo_kde and mutualinfo_binned follow Khan et al. 2007

Shiraj Khan, Sharba Bandyopadhyay, Auroop R. Ganguly, Sunil Saigal,
David J. Erickson, III, Vladimir Protopopescu, and George Ostrouchov,
Relative performance of mutual information estimation methods for
quantifying the dependence among short and noisy data,
Phys. Rev. E 76, 026209 (2007)
http://pre.aps.org/abstract/PRE/v76/i2/e026209


'''

import numpy as np
from scipy import stats
from scipy.stats import gaussian_kde

import statsmodels.sandbox.infotheo as infotheo
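
# Background on the `normed` option used by the estimators below: it maps
# mutual information to the scale of a correlation coefficient,
# mi_normed = sqrt(1 - exp(-2 * mi)).  For a bivariate normal distribution
# mi = -0.5 * log(1 - rho**2), so in that case the normed measure equals
# |rho| exactly.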


def mutualinfo_kde(y, x, normed=True):
    '''mutual information of two random variables estimated with kde

    If `normed` is true, the result is mapped to the [0, 1] scale of a
    correlation coefficient via sqrt(1 - exp(-2 * mi)).
    '''
    nobs = len(x)
    if not len(y) == nobs:
        raise ValueError('both data arrays need to have the same size')
    x = np.asarray(x, float)
    y = np.asarray(y, float)
    yx = np.vstack((y, x))
    # evaluate the marginal and joint kernel density estimates at the data
    kde_x = gaussian_kde(x)(x)
    kde_y = gaussian_kde(y)(y)
    kde_yx = gaussian_kde(yx)(yx)

    # sample average of log f_yx - log f_x - log f_y
    mi_obs = np.log(kde_yx) - np.log(kde_x) - np.log(kde_y)
    mi = mi_obs.sum() / nobs
    if normed:
        mi_normed = np.sqrt(1. - np.exp(-2 * mi))
        return mi_normed
    else:
        return mi
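
# A minimal usage sketch (an illustrative addition; the helper name is made
# up): the kde-based normed mutual information detects a quadratic
# relationship for which the linear correlation is close to zero.
def _example_mutualinfo_kde():
    rng = np.random.RandomState(12345)
    x = rng.randn(500)
    y = x**2 + 0.5 * rng.randn(500)
    print('corrcoef      ', np.corrcoef(y, x)[0, 1])  # near zero
    print('mutualinfo_kde', mutualinfo_kde(y, x))     # clearly positive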
def mutualinfo_kde_2sample(y, x, normed=True):
    '''kde-based estimate of the divergence between two samples

    Fits a kernel density estimate to each sample and evaluates the average
    log density ratio at the points of `x`, i.e. an estimate of the
    Kullback-Leibler divergence of the distribution of `x` from the
    distribution of `y`.
    '''
    nobs = len(x)
    x = np.asarray(x, float)
    y = np.asarray(y, float)
    # both kdes are evaluated at the observations of the first sample
    kde_x = gaussian_kde(x.T)(x.T)
    kde_y = gaussian_kde(y.T)(x.T)
    mi_obs = np.log(kde_x) - np.log(kde_y)
    if len(mi_obs) != nobs:
        raise ValueError("Wrong number of observations")
    mi = mi_obs.mean()
    if normed:
        mi_normed = np.sqrt(1. - np.exp(-2 * mi))
        return mi_normed
    else:
        return mi
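
# Sketch for the two-sample variant (illustrative addition, hypothetical
# helper name): the estimated divergence is near zero when both samples come
# from the same distribution and positive when they do not; `normed=False`
# avoids taking the square root of a negative value when the estimate falls
# slightly below zero.
def _example_mutualinfo_kde_2sample():
    rng = np.random.RandomState(12345)
    x = rng.randn(300)
    print('same distribution   ',
          mutualinfo_kde_2sample(rng.randn(300), x, normed=False))
    print('shifted distribution',
          mutualinfo_kde_2sample(2 + rng.randn(300), x, normed=False))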
def mutualinfo_binned(y, x, bins, normed=True):
    '''mutual information of two random variables estimated from binned data

    Notes
    -----
    bins='auto' selects the number of bins so that approximately 5
    observations are expected to be in each bin under the assumption of
    independence. This follows roughly the description in Khan et al. 2007
    '''
    nobs = len(x)
    if not len(y) == nobs:
        raise ValueError('both data arrays need to have the same size')
    x = np.asarray(x, float)
    y = np.asarray(y, float)

    if bins == 'auto':
        ys = np.sort(y)
        xs = np.sort(x)
        # number of quantile bins chosen so that each cell of the 2d
        # histogram is expected to hold about 5 observations under
        # independence; linspace requires an integer number of points
        qbin_sqr = np.sqrt(5. / nobs)
        quantiles = np.linspace(0, 1, int(round(1. / qbin_sqr)))
        quantile_index = ((nobs - 1) * quantiles).astype(int)
        # move edges by 1e-6 so that they do not coincide with an
        # observation; the lowest edge is shifted below the minimum
        shift = 1e-6 * np.ones(quantiles.shape)
        shift[0] -= 2 * 1e-6
        binsy = ys[quantile_index] + shift
        binsx = xs[quantile_index] + shift
    elif np.size(bins) == 1:
        binsy = bins
        binsx = bins
    elif len(bins) == 2:
        binsy, binsx = bins

    fx, binsx = np.histogram(x, bins=binsx)
    fy, binsy = np.histogram(y, bins=binsy)
    fyx, binsy, binsx = np.histogram2d(y, x, bins=(binsy, binsx))

    pyx = fyx * 1. / nobs
    px = fx * 1. / nobs
    py = fy * 1. / nobs

    # the small constant avoids log(0) for empty cells of the joint histogram
    mi_obs = pyx * (np.log(pyx + 1e-10) - np.log(py)[:, None] - np.log(px))
    mi = mi_obs.sum()
    if normed:
        mi_normed = np.sqrt(1. - np.exp(-2 * mi))
        return mi_normed, (pyx, py, px, binsy, binsx), mi_obs
    else:
        return mi
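
# Sketch comparing the binning rules (illustrative addition, hypothetical
# helper name): 'auto' builds quantile-based bin edges targeting about 5
# observations per cell under independence, while an integer requests that
# many equal-width bins.
def _example_mutualinfo_binned():
    rng = np.random.RandomState(12345)
    x = rng.randn(400)
    y = x + 0.5 * rng.randn(400)
    print('auto bins       ', mutualinfo_binned(y, x, 'auto')[0])
    print('equal-width bins', mutualinfo_binned(y, x, 5)[0])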
if __name__ == '__main__':
    import statsmodels.api as sm

    funtype = ['linear', 'quadratic'][1]
    nobs = 200
    sig = 2  # 5.
    #x = np.linspace(-3, 3, nobs) + np.random.randn(nobs)
    x = np.sort(3 * np.random.randn(nobs))
    exog = sm.add_constant(x, prepend=True)
    #y = 0 + np.log(1 + x**2) + sig * np.random.randn(nobs)
    if funtype == 'quadratic':
        y = 0 + x**2 + sig * np.random.randn(nobs)
    if funtype == 'linear':
        y = 0 + x + sig * np.random.randn(nobs)

    print('correlation')
    print(np.corrcoef(y, x)[0, 1])
    print('pearsonr', stats.pearsonr(y, x))
    print('spearmanr', stats.spearmanr(y, x))
    print('kendalltau', stats.kendalltau(y, x))

    pxy, binsx, binsy = np.histogram2d(x, y, bins=5)
    px, binsx_ = np.histogram(x, bins=binsx)
    py, binsy_ = np.histogram(y, bins=binsy)
    print('mutualinfo', infotheo.mutualinfo(px * 1. / nobs, py * 1. / nobs,
                                            1e-15 + pxy * 1. / nobs,
                                            logbase=np.e))

    print('mutualinfo_kde normed', mutualinfo_kde(y, x))
    print('mutualinfo_kde       ', mutualinfo_kde(y, x, normed=False))
    mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
        mutualinfo_binned(y, x, 5, normed=True)
    print('mutualinfo_binned normed', mi_normed)
    print('mutualinfo_binned       ', mi_obs.sum())

    mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
        mutualinfo_binned(y, x, 'auto', normed=True)
    print('auto')
    print('mutualinfo_binned normed', mi_normed)
    print('mutualinfo_binned       ', mi_obs.sum())

    ys = np.sort(y)
    xs = np.sort(x)
    qidx = ((nobs - 1) * np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)
    by = ys[qidx]
    bx = xs[qidx]
    mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
        mutualinfo_binned(y, x, (by, bx), normed=True)
    print('quantiles')
    print('mutualinfo_binned normed', mi_normed)
    print('mutualinfo_binned       ', mi_obs.sum())

    doplot = 1  # False
    if doplot:
        import matplotlib.pyplot as plt
        plt.plot(x, y, 'o')
        olsres = sm.OLS(y, exog).fit()
        plt.plot(x, olsres.fittedvalues)