Source code for statsmodels.sandbox.stats.stats_dhuard

'''
from David Huard's scipy sandbox, also attached to a ticket and
in the matplotlib-user mailinglist  (links ???)


Notes
=====

out of bounds interpolation raises exception and wouldn't be completely
defined ::

>>> scoreatpercentile(x, [0,25,50,100])
Traceback (most recent call last):
...
    raise ValueError("A value in x_new is below the interpolation "
ValueError: A value in x_new is below the interpolation range.
>>> percentileofscore(x, [-50, 50])
Traceback (most recent call last):
...
    raise ValueError("A value in x_new is below the interpolation "
ValueError: A value in x_new is below the interpolation range.


idea
====

histogram and empirical interpolated distribution
-------------------------------------------------

dual constructor
* empirical cdf : cdf on all observations through linear interpolation
* binned cdf : based on histogram
both should work essentially the same, although pdf of empirical has
many spikes, fluctuates a lot
- alternative: binning based on interpolated cdf : example in script
* ppf: quantileatscore based on interpolated cdf
* rvs : generic from ppf
* stats, expectation ? how does integration wrt cdf work - theory?

Problems
* limits, lower and upper bound of support
  does not work or is undefined with empirical cdf and interpolation
* extending bounds ?
  matlab has pareto tails for empirical distribution, breaks linearity

empirical distribution with higher order interpolation
------------------------------------------------------

* should work easily enough with interpolating splines
* not piecewise linear
* can use pareto (or other) tails
* ppf how do I get the inverse function of a higher order spline?
  Chuck: resample and fit spline to inverse function
  this will have an approximation error in the inverse function
* -> doesn't work: higher order spline doesn't preserve monotonicity
  see mailing list for response to my question
* pmf from derivative available in spline

-> forget this and use kernel density estimator instead


bootstrap/empirical distribution:
---------------------------------

discrete distribution on real line given observations
what's defined?
* cdf : step function
* pmf : points with equal weight 1/nobs
* rvs : resampling
* ppf : quantileatscore on sample?
* moments : from data ?
* expectation ? sum_{all observations x} [func(x) * pmf(x)]
* similar for discrete distribution on real line
* References : ?
* what's the point? most of it is trivial, just for the record ?


Created on Monday, May 03, 2010, 11:47:03 AM
Author: josef-pktd, parts based on David Huard
License: BSD

'''
from __future__ import print_function
import scipy.interpolate as interpolate
import numpy as np

[docs]def scoreatpercentile(data, percentile): """Return the score at the given percentile of the data. Example: >>> data = randn(100) >>> scoreatpercentile(data, 50) will return the median of sample `data`. """ per = np.array(percentile) cdf = empiricalcdf(data) interpolator = interpolate.interp1d(np.sort(cdf), np.sort(data)) return interpolator(per/100.)
[docs]def percentileofscore(data, score): """Return the percentile-position of score relative to data. score: Array of scores at which the percentile is computed. Return percentiles (0-100). Example r = randn(50) x = linspace(-2,2,100) percentileofscore(r,x) Raise an error if the score is outside the range of data. """ cdf = empiricalcdf(data) interpolator = interpolate.interp1d(np.sort(data), np.sort(cdf)) return interpolator(score)*100.
[docs]def empiricalcdf(data, method='Hazen'): """Return the empirical cdf. Methods available: Hazen: (i-0.5)/N Weibull: i/(N+1) Chegodayev: (i-.3)/(N+.4) Cunnane: (i-.4)/(N+.2) Gringorten: (i-.44)/(N+.12) California: (i-1)/N Where i goes from 1 to N. """ i = np.argsort(np.argsort(data)) + 1. N = len(data) method = method.lower() if method == 'hazen': cdf = (i-0.5)/N elif method == 'weibull': cdf = i/(N+1.) elif method == 'california': cdf = (i-1.)/N elif method == 'chegodayev': cdf = (i-.3)/(N+.4) elif method == 'cunnane': cdf = (i-.4)/(N+.2) elif method == 'gringorten': cdf = (i-.44)/(N+.12) else: raise ValueError('Unknown method. Choose among Weibull, Hazen,' 'Chegodayev, Cunnane, Gringorten and California.') return cdf
[docs]class HistDist(object): '''Distribution with piecewise linear cdf, pdf is step function can be created from empiricial distribution or from a histogram (not done yet) work in progress, not finished '''
[docs] def __init__(self, data): self.data = np.atleast_1d(data) self.binlimit = np.array([self.data.min(), self.data.max()]) sortind = np.argsort(data) self._datasorted = data[sortind] self.ranking = np.argsort(sortind) cdf = self.empiricalcdf() self._empcdfsorted = np.sort(cdf) self.cdfintp = interpolate.interp1d(self._datasorted, self._empcdfsorted) self.ppfintp = interpolate.interp1d(self._empcdfsorted, self._datasorted)
[docs] def empiricalcdf(self, data=None, method='Hazen'): """Return the empirical cdf. Methods available: Hazen: (i-0.5)/N Weibull: i/(N+1) Chegodayev: (i-.3)/(N+.4) Cunnane: (i-.4)/(N+.2) Gringorten: (i-.44)/(N+.12) California: (i-1)/N Where i goes from 1 to N. """ if data is None: data = self.data i = self.ranking else: i = np.argsort(np.argsort(data)) + 1. N = len(data) method = method.lower() if method == 'hazen': cdf = (i-0.5)/N elif method == 'weibull': cdf = i/(N+1.) elif method == 'california': cdf = (i-1.)/N elif method == 'chegodayev': cdf = (i-.3)/(N+.4) elif method == 'cunnane': cdf = (i-.4)/(N+.2) elif method == 'gringorten': cdf = (i-.44)/(N+.12) else: raise ValueError('Unknown method. Choose among Weibull, Hazen,' 'Chegodayev, Cunnane, Gringorten and California.') return cdf
[docs] def cdf_emp(self, score): ''' this is score in dh ''' return self.cdfintp(score)
#return percentileofscore(self.data, score)
[docs] def ppf_emp(self, quantile): ''' this is score in dh ''' return self.ppfintp(quantile)
#return scoreatpercentile(self.data, quantile*100) #from DHuard http://old.nabble.com/matplotlib-f2903.html
[docs] def optimize_binning(self, method='Freedman'): """Find the optimal number of bins and update the bin countaccordingly. Available methods : Freedman Scott """ nobs = len(self.data) if method=='Freedman': IQR = self.ppf_emp(0.75) - self.ppf_emp(0.25) # Interquantile range(75% -25%) width = 2* IQR* nobs**(-1./3) elif method=='Scott': width = 3.49 * np.std(self.data) * nobs**(-1./3) self.nbin = (self.binlimit.ptp()/width) return self.nbin
#changes: josef-pktd if __name__ == '__main__': import matplotlib.pyplot as plt nobs = 100 x = np.random.randn(nobs) examples = [2] if 1 in examples: empiricalcdf(x) print(percentileofscore(x, 0.5)) print(scoreatpercentile(x, 50)) import matplotlib.pyplot as plt xsupp = np.linspace(x.min(), x.max()) pos = percentileofscore(x, xsupp) plt.plot(xsupp, pos) #perc = np.linspace(2.5, 97.5) #plt.plot(scoreatpercentile(x, perc), perc) plt.plot(scoreatpercentile(x, pos), pos+1) #emp = interpolate.PiecewisePolynomial(np.sort(empiricalcdf(x)), np.sort(x)) emp=interpolate.InterpolatedUnivariateSpline(np.sort(x),np.sort(empiricalcdf(x)),k=1) pdfemp = np.array([emp.derivatives(xi)[1] for xi in xsupp]) plt.figure() plt.plot(xsupp,pdfemp) cdf_ongrid = emp(xsupp) plt.figure() plt.plot(xsupp, cdf_ongrid) #get pdf from interpolated cdf on a regular grid plt.figure() plt.step(xsupp[:-1],np.diff(cdf_ongrid)/np.diff(xsupp)) #reduce number of bins/steps xsupp2 = np.linspace(x.min(), x.max(), 25) plt.figure() plt.step(xsupp2[:-1],np.diff(emp(xsupp2))/np.diff(xsupp2)) #pdf using 25 original observations, every (nobs/25)th xso = np.sort(x) xs = xso[::nobs/25] plt.figure() plt.step(xs[:-1],np.diff(emp(xs))/np.diff(xs)) #lower end looks strange histd = HistDist(x) print(histd.optimize_binning()) print(histd.cdf_emp(histd.binlimit)) print(histd.ppf_emp([0.25, 0.5, 0.75])) print(histd.cdf_emp([-0.5, -0.25, 0, 0.25, 0.5])) xsupp = np.linspace(x.min(), x.max(), 500) emp=interpolate.InterpolatedUnivariateSpline(np.sort(x),np.sort(empiricalcdf(x)),k=1) #pdfemp = np.array([emp.derivatives(xi)[1] for xi in xsupp]) #plt.figure() #plt.plot(xsupp,pdfemp) cdf_ongrid = emp(xsupp) plt.figure() plt.plot(xsupp, cdf_ongrid) ppfintp = interpolate.InterpolatedUnivariateSpline(cdf_ongrid,xsupp,k=3) ppfs = ppfintp(cdf_ongrid) plt.plot(ppfs, cdf_ongrid) #ppfemp=interpolate.InterpolatedUnivariateSpline(np.sort(empiricalcdf(x)),np.sort(x),k=3) #Don't use interpolating splines for function approximation #with s=0.03 the spline is monotonic at the evaluated values ppfemp=interpolate.UnivariateSpline(np.sort(empiricalcdf(x)),np.sort(x),k=3, s=0.03) ppfe = ppfemp(cdf_ongrid) plt.plot(ppfe, cdf_ongrid) print('negative density') print('(np.diff(ppfs)).min()', (np.diff(ppfs)).min()) print('(np.diff(cdf_ongrid)).min()', (np.diff(cdf_ongrid)).min()) #plt.show()