'''runstest
formulas for mean and var of runs taken from SAS manual NPAR tests, also idea
for runstest_1samp and runstest_2samp
Description in NIST handbook and dataplot doesn't explain their expected
values, or variance
Note:
There are (at least) two definitions of runs used in literature. The classical
definition which is also used here, is that runs are sequences of identical
observations separated by observations with different realizations.
The second definition allows for overlapping runs, or runs where counting a
run is also started after a run of a fixed length of the same kind.
TODO
* add one-sided tests where possible or where it makes sense
'''
from __future__ import print_function
import numpy as np
from scipy import stats
from scipy.misc import comb
[docs]class Runs(object):
'''class for runs in a binary sequence
Parameters
----------
x : array_like, 1d
data array,
Notes
-----
This was written as a more general class for runs. This has some redundant
calculations when only the runs_test is used.
TODO: make it lazy
The runs test could be generalized to more than 1d if there is a use case
for it.
This should be extended once I figure out what the distribution of runs
of any length k is.
The exact distribution for the runs test is also available but not yet
verified.
'''
[docs] def __init__(self, x):
self.x = np.asarray(x)
self.runstart = runstart = np.nonzero(np.diff(np.r_[[-np.inf], x, [np.inf]]))[0]
self.runs = runs = np.diff(runstart)
self.runs_sign = runs_sign = x[runstart[:-1]]
self.runs_pos = runs[runs_sign==1]
self.runs_neg = runs[runs_sign==0]
self.runs_freqs = np.bincount(runs)
self.n_runs = len(self.runs)
self.n_pos = (x==1).sum()
[docs] def runs_test(self, correction=True):
'''basic version of runs test
Parameters
----------
correction: bool
Following the SAS manual, for samplesize below 50, the test
statistic is corrected by 0.5. This can be turned off with
correction=False, and was included to match R, tseries, which
does not use any correction.
pvalue based on normal distribution, with integer correction
'''
self.npo = npo = (self.runs_pos).sum()
self.nne = nne = (self.runs_neg).sum()
#n_r = self.n_runs
n = npo + nne
npn = npo * nne
rmean = 2. * npn / n + 1
rvar = 2. * npn * (2.*npn - n) / n**2. / (n-1.)
rstd = np.sqrt(rvar)
rdemean = self.n_runs - rmean
if n >= 50 or not correction:
z = rdemean
else:
if rdemean > 0.5:
z = rdemean - 0.5
elif rdemean < 0.5:
z = rdemean + 0.5
else:
z = 0.
z /= rstd
pval = 2 * stats.norm.sf(np.abs(z))
return z, pval
[docs]def runstest_1samp(x, cutoff='mean', correction=True):
'''use runs test on binary discretized data above/below cutoff
Parameters
----------
x : array_like
data, numeric
cutoff : {'mean', 'median'} or number
This specifies the cutoff to split the data into large and small
values.
correction: bool
Following the SAS manual, for samplesize below 50, the test
statistic is corrected by 0.5. This can be turned off with
correction=False, and was included to match R, tseries, which
does not use any correction.
Returns
-------
z_stat : float
test statistic, asymptotically normally distributed
p-value : float
p-value, reject the null hypothesis if it is below an type 1 error
level, alpha .
'''
if cutoff == 'mean':
cutoff = np.mean(x)
elif cutoff == 'median':
cutoff = np.median(x)
xindicator = (x >= cutoff).astype(int)
return Runs(xindicator).runs_test(correction=correction)
[docs]def runstest_2samp(x, y=None, groups=None, correction=True):
'''Wald-Wolfowitz runstest for two samples
This tests whether two samples come from the same distribution.
Parameters
----------
x : array_like
data, numeric, contains either one group, if y is also given, or
both groups, if additionally a group indicator is provided
y : array_like (optional)
data, numeric
groups : array_like
group labels or indicator the data for both groups is given in a
single 1-dimensional array, x. If group labels are not [0,1], then
correction: bool
Following the SAS manual, for samplesize below 50, the test
statistic is corrected by 0.5. This can be turned off with
correction=False, and was included to match R, tseries, which
does not use any correction.
Returns
-------
z_stat : float
test statistic, asymptotically normally distributed
p-value : float
p-value, reject the null hypothesis if it is below an type 1 error
level, alpha .
Notes
-----
Wald-Wolfowitz runs test.
If there are ties, then then the test statistic and p-value that is
reported, is based on the higher p-value between sorting all tied
observations of the same group
This test is intended for continuous distributions
SAS has treatment for ties, but not clear, and sounds more complicated
(minimum and maximum possible runs prevent use of argsort)
(maybe it's not so difficult, idea: add small positive noise to first
one, run test, then to the other, run test, take max(?) p-value - DONE
This gives not the minimum and maximum of the number of runs, but should
be close. Not true, this is close to minimum but far away from maximum.
maximum number of runs would use alternating groups in the ties.)
Maybe adding random noise would be the better approach.
SAS has exact distribution for sample size <=30, doesn't look standard
but should be easy to add.
currently two-sided test only
This has not been verified against a reference implementation. In a short
Monte Carlo simulation where both samples are normally distribute, the test
seems to be correctly sized for larger number of observations (30 or
larger), but conservative (i.e. reject less often than nominal) with a
sample size of 10 in each group.
See Also
--------
runs_test_1samp
Runs
RunsProb
'''
x = np.asarray(x)
if not y is None:
y = np.asarray(y)
groups = np.concatenate((np.zeros(len(x)), np.ones(len(y))))
# note reassigning x
x = np.concatenate((x, y))
gruni = np.arange(2)
elif not groups is None:
gruni = np.unique(groups)
if gruni.size != 2: # pylint: disable=E1103
raise ValueError('not exactly two groups specified')
#require groups to be numeric ???
else:
raise ValueError('either y or groups is necessary')
xargsort = np.argsort(x)
#check for ties
x_sorted = x[xargsort]
x_diff = np.diff(x_sorted) # used for detecting and handling ties
if x_diff.min() == 0:
print('ties detected') #replace with warning
x_mindiff = x_diff[x_diff > 0].min()
eps = x_mindiff/2.
xx = x.copy() #don't change original, just in case
xx[groups==gruni[0]] += eps
xargsort = np.argsort(xx)
xindicator = groups[xargsort]
z0, p0 = Runs(xindicator).runs_test(correction=correction)
xx[groups==gruni[0]] -= eps #restore xx = x
xx[groups==gruni[1]] += eps
xargsort = np.argsort(xx)
xindicator = groups[xargsort]
z1, p1 = Runs(xindicator).runs_test(correction=correction)
idx = np.argmax([p0,p1])
return [z0, z1][idx], [p0, p1][idx]
else:
xindicator = groups[xargsort]
return Runs(xindicator).runs_test(correction=correction)
[docs]class TotalRunsProb(object):
'''class for the probability distribution of total runs
This is the exact probability distribution for the (Wald-Wolfowitz)
runs test. The random variable is the total number of runs if the
sample has (n0, n1) observations of groups 0 and 1.
Notes
-----
Written as a class so I can store temporary calculations, but I don't
think it matters much.
Formulas taken from SAS manual for one-sided significance level.
Could be converted to a full univariate distribution, subclassing
scipy.stats.distributions.
*Status*
Not verified yet except for mean.
'''
[docs] def __init__(self, n0, n1):
self.n0 = n0
self.n1 = n1
self.n = n = n0 + n1
self.comball = comb(n, n1)
[docs] def runs_prob_even(self, r):
n0, n1 = self.n0, self.n1
tmp0 = comb(n0-1, r//2-1)
tmp1 = comb(n1-1, r//2-1)
return tmp0 * tmp1 * 2. / self.comball
[docs] def runs_prob_odd(self, r):
n0, n1 = self.n0, self.n1
k = (r+1)//2
tmp0 = comb(n0-1, k-1)
tmp1 = comb(n1-1, k-2)
tmp3 = comb(n0-1, k-2)
tmp4 = comb(n1-1, k-1)
return (tmp0 * tmp1 + tmp3 * tmp4) / self.comball
[docs] def pdf(self, r):
r = np.asarray(r)
r_isodd = np.mod(r, 2) > 0
r_odd = r[r_isodd]
r_even = r[~r_isodd]
runs_pdf = np.zeros(r.shape)
runs_pdf[r_isodd] = self.runs_prob_odd(r_odd)
runs_pdf[~r_isodd] = self.runs_prob_even(r_even)
return runs_pdf
[docs] def cdf(self, r):
r_ = np.arange(2,r+1)
cdfval = self.runs_prob_even(r_[::2]).sum()
cdfval += self.runs_prob_odd(r_[1::2]).sum()
return cdfval
[docs]class RunsProb(object):
'''distribution of success runs of length k or more (classical definition)
The underlying process is assumed to be a sequence of Bernoulli trials
of a given length n.
not sure yet, how to interpret or use the distribution for runs
of length k or more.
Musseli also has longest success run, and waiting time distribution
negative binomial of order k and geometric of order k
need to compare with Godpole
need a MonteCarlo function to do some quick tests before doing more
'''
[docs] def pdf(self, x, k, n, p):
'''distribution of success runs of length k or more
Parameters
----------
x : float
count of runs of length n
k : int
length of runs
n : int
total number of observations or trials
p : float
probability of success in each Bernoulli trial
Returns
-------
pdf : float
probability that x runs of length of k are observed
Notes
-----
not yet vectorized
References
----------
Muselli 1996, theorem 3
'''
q = 1-p
m = np.arange(x, (n+1)//(k+1)+1)[:,None]
terms = (-1)**(m-x) * comb(m, x) * p**(m*k) * q**(m-1) \
* (comb(n - m*k, m - 1) + q * comb(n - m*k, m))
return terms.sum(0)
[docs] def pdf_nb(self, x, k, n, p):
pass
#y = np.arange(m-1, n-mk+1
'''
>>> [np.sum([RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]) for k in range(16)]
[0.99999332193894064, 0.99999999999999367, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
>>> [(np.arange(0,16) * [RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]).sum() for k in range(16)]
[6.9998931510341809, 4.1406249999999929, 2.4414062500000075, 1.4343261718749996, 0.83923339843749856, 0.48875808715820324, 0.28312206268310569, 0.1629814505577086, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
>>> np.array([(np.arange(0,16) * [RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]).sum() for k in range(16)])/11
array([ 0.63635392, 0.37642045, 0.22194602, 0.13039329, 0.07629395,
0.04443255, 0.02573837, 0.0148165 , 0. , 0. ,
0. , 0. , 0. , 0. , 0. , 0. ])
>>> np.diff([(np.arange(0,16) * [RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]).sum() for k in range(16)][::-1])
array([ 0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.16298145, 0.12014061, 0.20563602,
0.35047531, 0.59509277, 1.00708008, 1.69921875, 2.85926815])
'''
[docs]def cochrans_q(x):
'''Cochran's Q test for identical effect of k treatments
Cochran's Q is a k-sample extension of the McNemar test. If there are only
two treatments, then Cochran's Q test and McNemar test are equivalent.
Test that the probability of success is the same for each treatment.
The alternative is that at least two treatments have a different
probability of success.
Parameters
----------
x : array_like, 2d (N,k)
data with N cases and k variables
Returns
-------
q_stat : float
test statistic
pvalue : float
pvalue from the chisquare distribution
Notes
-----
In Wikipedia terminology, rows are blocks and columns are treatments.
The number of rows N, should be large for the chisquare distribution to be
a good approximation.
The Null hypothesis of the test is that all treatments have the
same effect.
References
----------
http://en.wikipedia.org/wiki/Cochran_test
SAS Manual for NPAR TESTS
'''
x = np.asarray(x)
gruni = np.unique(x)
N, k = x.shape
count_row_success = (x==gruni[-1]).sum(1, float)
count_col_success = (x==gruni[-1]).sum(0, float)
count_row_ss = count_row_success.sum()
count_col_ss = count_col_success.sum()
assert count_row_ss == count_col_ss #just a calculation check
#this is SAS manual
q_stat = (k-1) * (k * np.sum(count_col_success**2) - count_col_ss**2) \
/ (k * count_row_ss - np.sum(count_row_success**2))
#Note: the denominator looks just like k times the variance of the
#columns
#Wikipedia uses a different, but equivalent expression
## q_stat = (k-1) * (k * np.sum(count_row_success**2) - count_row_ss**2) \
## / (k * count_col_ss - np.sum(count_col_success**2))
return q_stat, stats.chi2.sf(q_stat, k-1)
[docs]def mcnemar(x, y=None, exact=True, correction=True):
'''McNemar test
Parameters
----------
x, y : array_like
two paired data samples. If y is None, then x can be a 2 by 2
contingency table. x and y can have more than one dimension, then
the results are calculated under the assumption that axis zero
contains the observation for the samples.
exact : bool
If exact is true, then the binomial distribution will be used.
If exact is false, then the chisquare distribution will be used, which
is the approximation to the distribution of the test statistic for
large sample sizes.
correction : bool
If true, then a continuity correction is used for the chisquare
distribution (if exact is false.)
Returns
-------
stat : float or int, array
The test statistic is the chisquare statistic if exact is false. If the
exact binomial distribution is used, then this contains the min(n1, n2),
where n1, n2 are cases that are zero in one sample but one in the other
sample.
pvalue : float or array
p-value of the null hypothesis of equal effects.
Notes
-----
This is a special case of Cochran's Q test. The results when the chisquare
distribution is used are identical, except for continuity correction.
'''
x = np.asarray(x)
if y is None and x.shape[0] == x.shape[1]:
if x.shape[0] != 2:
raise ValueError('table needs to be 2 by 2')
n1, n2 = x[1, 0], x[0, 1]
else:
# I'm not checking here whether x and y are binary,
# isn't this also paired sign test
n1 = np.sum(x < y, 0)
n2 = np.sum(x > y, 0)
if exact:
stat = np.minimum(n1, n2)
# binom is symmetric with p=0.5
pval = stats.binom.cdf(stat, n1 + n2, 0.5) * 2
pval = np.minimum(pval, 1) # limit to 1 if n1==n2
else:
corr = int(correction) # convert bool to 0 or 1
stat = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))
df = 1
pval = stats.chi2.sf(stat, df)
return stat, pval
[docs]def symmetry_bowker(table):
'''Test for symmetry of a (k, k) square contingency table
This is an extension of the McNemar test to test the Null hypothesis
that the contingency table is symmetric around the main diagonal, that is
n_{i, j} = n_{j, i} for all i, j
Parameters
----------
table : array_like, 2d, (k, k)
a square contingency table that contains the count for k categories
in rows and columns.
Returns
-------
statistic : float
chisquare test statistic
p-value : float
p-value of the test statistic based on chisquare distribution
df : int
degrees of freedom of the chisquare distribution
Notes
-----
Implementation is based on the SAS documentation, R includes it in
`mcnemar.test` if the table is not 2 by 2.
The pvalue is based on the chisquare distribution which requires that the
sample size is not very small to be a good approximation of the true
distribution. For 2x2 contingency tables exact distribution can be
obtained with `mcnemar`
See Also
--------
mcnemar
'''
table = np.asarray(table)
k, k2 = table.shape
if k != k2:
raise ValueError('table needs to be square')
#low_idx = np.tril_indices(k, -1) # this doesn't have Fortran order
upp_idx = np.triu_indices(k, 1)
tril = table.T[upp_idx] # lower triangle in column order
triu = table[upp_idx] # upper triangle in row order
stat = ((tril - triu)**2 / (tril + triu + 1e-20)).sum()
df = k * (k-1) / 2.
pval = stats.chi2.sf(stat, df)
return stat, pval, df
if __name__ == '__main__':
x1 = np.array([1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1])
print(Runs(x1).runs_test())
print(runstest_1samp(x1, cutoff='mean'))
print(runstest_2samp(np.arange(16,0,-1), groups=x1))
print(TotalRunsProb(7,9).cdf(11))
print(median_test_ksample(np.random.randn(100), np.random.randint(0,2,100)))
print(cochrans_q(np.random.randint(0,2,(100,8))))