Source code for statsmodels.sandbox.formula
"""
Provides the basic classes needed to specify statistical models.
namespace : dictionary
mapping from names to data, used to associate data to a formula or term
"""
from statsmodels.compat.python import (iterkeys, lrange, callable, string_types,
itervalues, range)
import copy
import types
import numpy as np
__docformat__ = 'restructuredtext'
default_namespace = {}
[docs]class Term(object):
"""
This class is very simple: it is just a named term in a model formula.
It is also callable: by default it namespace[self.name], where namespace
defaults to formula.default_namespace.
When called in an instance of formula,
the namespace used is that formula's namespace.
Inheritance of the namespace under +,*,- operations:
----------------------------------------------------
By default, the namespace is empty, which means it must be
specified before evaluating the design matrix.
When it is unambiguous, the namespaces of objects are derived from the
context.
Rules:
------
i) "X * I", "X + I", "X**i": these inherit X's namespace
ii) "F.main_effect()": this inherits the Factor F's namespace
iii) "A-B": this inherits A's namespace
iv) if A.namespace == B.namespace, then A+B inherits this namespace
v) if A.namespace == B.namespace, then A*B inherits this namespace
Equality of namespaces:
-----------------------
This is done by comparing the namespaces directly, if
an exception is raised in the check of equality, they are
assumed not to be equal.
"""
def __pow__(self, power):
"""
Raise the quantitative term's values to an integer power, i.e.
polynomial.
"""
try:
power = float(power)
except:
raise ValueError('expecting a float')
if power == int(power):
name = '%s^%d' % (self.name, int(power))
else:
name = '%s^%0.2f' % (self.name, power)
value = Quantitative(name, func=self, transform=lambda x: np.power(x, power))
value.power = power
value.namespace = self.namespace
return value
[docs] def __init__(self, name, func=None, termname=None):
self.name = name
self.__namespace = None
if termname is None:
self.termname = name
else:
self.termname = termname
if not isinstance(self.termname, string_types):
raise ValueError('expecting a string for termname')
if func:
self.func = func
# Namespace in which self.name will be looked up in, if needed
def _get_namespace(self):
if isinstance(self.__namespace, np.ndarray):
return self.__namespace
else: return self.__namespace or default_namespace
def _set_namespace(self, value): self.__namespace = value
def _del_namespace(self): del self.__namespace
namespace = property(_get_namespace, _set_namespace, _del_namespace)
def __str__(self):
"""
'<term: %s>' % self.termname
"""
return '<term: %s>' % self.termname
def __add__(self, other):
"""
Formula(self) + Formula(other)
"""
fother = Formula(other, namespace=other.namespace)
f = fother + self
if _namespace_equal(fother.namespace, self.namespace):
f.namespace = self.namespace
return f
def __mul__(self, other):
"""
Formula(self) * Formula(other)
"""
if isinstance(other, Term) and other.name is 'intercept':
f = Formula(self, namespace=self.namespace)
elif self.name is 'intercept':
f = Formula(other, namespace=other.namespace)
else:
other = Formula(other, namespace=other.namespace)
f = other * self
if _namespace_equal(other.namespace, self.namespace):
f.namespace = self.namespace
return f
[docs] def names(self):
"""
Return the names of the columns in design associated to the terms,
i.e. len(self.names()) = self().shape[0].
"""
if isinstance(self.name, string_types):
return [self.name]
else:
return list(self.name)
def __call__(self, *args, **kw):
"""
Return the columns associated to self in a design matrix.
If the term has no 'func' attribute, it returns
``self.namespace[self.termname]``
else, it returns
``self.func(*args, **kw)``
"""
if not hasattr(self, 'func'):
val = self.namespace[self.termname]
else:
val = self.func
if callable(val):
if isinstance(val, (Term, Formula)):
val = copy.copy(val)
val.namespace = self.namespace
val = val(*args, **kw)
val = np.asarray(val)
return np.squeeze(val)
[docs]class Factor(Term):
"""A categorical factor."""
[docs] def __init__(self, termname, keys, ordinal=False):
"""
Factor is initialized with keys, representing all valid
levels of the factor.
If ordinal is False, keys can have repeats: set(keys) is what is
used.
If ordinal is True, the order is taken from the keys, and
there should be no repeats.
"""
if not ordinal:
self.keys = list(set(keys))
self.keys.sort()
else:
self.keys = keys
if len(set(keys)) != len(list(keys)):
raise ValueError('keys for ordinal Factor should be unique, in increasing order')
self._name = termname
self.termname = termname
self.ordinal = ordinal
if self.ordinal:
name = self.termname
else:
name = ['(%s==%s)' % (self.termname, str(key)) for key in self.keys]
Term.__init__(self, name, termname=self.termname, func=self.get_columns)
[docs] def get_columns(self, *args, **kw):
"""
Calling function for factor instance.
"""
v = self.namespace[self._name]
while True:
if callable(v):
if isinstance(v, (Term, Formula)):
v = copy.copy(v)
v.namespace = self.namespace
v = v(*args, **kw)
else: break
n = len(v)
if self.ordinal:
col = [float(self.keys.index(v[i])) for i in range(n)]
return np.array(col)
else:
value = []
for key in self.keys:
col = [float((v[i] == key)) for i in range(n)]
value.append(col)
return np.array(value)
[docs] def values(self, *args, **kw):
"""
Return the keys of the factor, rather than the columns of the design
matrix.
"""
del(self.func)
val = self(*args, **kw)
self.func = self.get_columns
return val
[docs] def verify(self, values):
"""
Verify that all values correspond to valid keys in self.
"""
s = set(values)
if not s.issubset(self.keys):
raise ValueError('unknown keys in values')
def __add__(self, other):
"""
Formula(self) + Formula(other)
When adding \'intercept\' to a factor, this just returns
Formula(self, namespace=self.namespace)
"""
if isinstance(other, Term) and other.name is 'intercept':
return Formula(self, namespace=self.namespace)
else:
return Term.__add__(self, other)
[docs] def main_effect(self, reference=None):
"""
Return the 'main effect' columns of a factor, choosing
an optional reference key.
The reference key can be one of the keys of the Factor,
or an integer, representing which column to remove.
It defaults to 0.
"""
names = self.names()
if reference is None:
reference = 0
else:
try:
reference = self.keys.index(reference)
except ValueError:
reference = int(reference)
def maineffect_func(value, reference=reference):
rvalue = []
keep = lrange(value.shape[0])
keep.pop(reference)
for i in range(len(keep)):
rvalue.append(value[keep[i]] - value[reference])
return np.array(rvalue)
keep = lrange(len(self.names()))
keep.pop(reference)
__names = self.names()
_names = ['%s-%s' % (__names[keep[i]], __names[reference]) for i in range(len(keep))]
value = Quantitative(_names, func=self,
termname='%s:maineffect' % self.termname,
transform=maineffect_func)
value.namespace = self.namespace
return value
def __getitem__(self, key):
"""
Retrieve the column corresponding to key in a Formula.
:Parameters:
key : one of the Factor's keys
:Returns: ndarray corresponding to key, when evaluated in
current namespace
"""
if not self.ordinal:
i = self.names().index('(%s==%s)' % (self.termname, str(key)))
return self()[i]
else:
v = self.namespace[self._name]
return np.array([(vv == key) for vv in v]).astype(np.float)
[docs]class Quantitative(Term):
"""
A subclass of term that can be used to apply point transformations
of another term, i.e. to take powers:
>>> import numpy as np
>>> from nipy.fixes.scipy.stats.models import formula
>>> X = np.linspace(0,10,101)
>>> x = formula.Term('X')
>>> x.namespace={'X':X}
>>> x2 = x**2
>>> print np.allclose(x()**2, x2())
True
>>> x3 = formula.Quantitative('x2', func=x, transform=lambda x: x**2)
>>> x3.namespace = x.namespace
>>> print np.allclose(x()**2, x3())
True
"""
[docs] def __init__(self, name, func=None, termname=None, transform=lambda x: x):
self.transform = transform
Term.__init__(self, name, func=func, termname=termname)
def __call__(self, *args, **kw):
"""
A quantitative is just like term, except there is an additional
transformation: self.transform.
"""
return self.transform(Term.__call__(self, *args, **kw))
[docs]class Formula(object):
"""
A formula object for manipulating design matrices in regression models,
essentially consisting of a list of term instances.
The object supports addition and multiplication which correspond
to concatenation and pairwise multiplication, respectively,
of the columns of the two formulas.
"""
def _get_namespace(self):
if isinstance(self.__namespace, np.ndarray):
return self.__namespace
else: return self.__namespace or default_namespace
def _set_namespace(self, value): self.__namespace = value
def _del_namespace(self): del self.__namespace
namespace = property(_get_namespace, _set_namespace, _del_namespace)
def _terms_changed(self):
self._names = self.names()
self._termnames = self.termnames()
[docs] def __init__(self, termlist, namespace=default_namespace):
"""
Create a formula from either:
i. a `formula` object
ii. a sequence of `term` instances
iii. one `term`
"""
self.__namespace = namespace
if isinstance(termlist, Formula):
self.terms = copy.copy(list(termlist.terms))
elif isinstance(termlist, list):
self.terms = termlist
elif isinstance(termlist, Term):
self.terms = [termlist]
else:
raise ValueError
self._terms_changed()
def __str__(self):
"""
String representation of list of termnames of a formula.
"""
value = []
for term in self.terms:
value += [term.termname]
return '<formula: %s>' % ' + '.join(value)
def __call__(self, *args, **kw):
"""
Create (transpose) of the design matrix of the formula within
namespace. Extra arguments are passed to each term instance. If
the formula just contains an intercept, then the keyword
argument 'nrow' indicates the number of rows (observations).
"""
if 'namespace' in kw:
namespace = kw['namespace']
else:
namespace = self.namespace
allvals = []
intercept = False
iindex = 0
for t in self.terms:
t = copy.copy(t)
t.namespace = namespace
val = t(*args, **kw)
isintercept = False
if hasattr(t, "termname"):
if t.termname == 'intercept':
intercept = True
isintercept = True
interceptindex = iindex
allvals.append(None)
if val.ndim == 1 and not isintercept:
val.shape = (1, val.shape[0])
allvals.append(val)
elif not isintercept:
allvals.append(val)
iindex += 1
if not intercept:
try:
allvals = np.concatenate(allvals)
except:
pass
else:
nrow = kw.get('nrow', -1)
if allvals != []:
if interceptindex > 0:
n = allvals[0].shape[1]
else:
n = allvals[1].shape[1]
allvals[interceptindex] = np.ones((1,n), np.float64)
allvals = np.concatenate(allvals)
elif nrow <= 1:
raise ValueError('with only intercept in formula, keyword \'nrow\' argument needed')
else:
allvals = I(nrow=nrow)
allvals.shape = (1,) + allvals.shape
return np.squeeze(allvals)
[docs] def hasterm(self, query_term):
"""
Determine whether a given term is in a formula.
"""
if not isinstance(query_term, Formula):
if isinstance(query_term, string_types):
try:
query = self[query_term]
return query.termname in self.termnames()
except:
return False
elif isinstance(query_term, Term):
return query_term.termname in self.termnames()
elif len(query_term.terms) == 1:
query_term = query_term.terms[0]
return query_term.termname in self.termnames()
else:
raise ValueError('more than one term passed to hasterm')
def __getitem__(self, name):
t = self.termnames()
if name in t:
return self.terms[t.index(name)]
else:
raise KeyError('formula has no such term: %s' % repr(name))
[docs] def termcolumns(self, query_term, dict=False):
"""
Return a list of the indices of all columns associated
to a given term.
"""
if self.hasterm(query_term):
names = query_term.names()
value = {}
for name in names:
value[name] = self._names.index(name)
else:
raise ValueError('term not in formula')
if dict:
return value
else:
return list(itervalues(value))
[docs] def names(self):
"""
Return a list of the names in the formula. The order of the
names corresponds to the order of the columns when self
is evaluated.
"""
allnames = []
for term in self.terms:
allnames += term.names()
return allnames
[docs] def termnames(self):
"""
Return a list of the term names in the formula. These
are the names of each term instance in self.
"""
names = []
for term in self.terms:
names += [term.termname]
return names
[docs] def design(self, *args, **kw):
"""
``transpose(self(*args, **kw))``
"""
return self(*args, **kw).T
def __mul__(self, other, nested=False):
"""
This returns a formula whose columns are the pairwise
product of the columns of self and other.
TO DO: check for nesting relationship. Should not be too difficult.
"""
other = Formula(other)
selftermnames = self.termnames()
othertermnames = other.termnames()
I = len(selftermnames)
J = len(othertermnames)
terms = []
termnames = []
for i in range(I):
for j in range(J):
termname = '%s*%s' % (str(selftermnames[i]), str(othertermnames[j]))
pieces = sorted(termname.split('*'))
termname = '*'.join(pieces)
termnames.append(termname)
selfnames = self.terms[i].names()
othernames = other.terms[j].names()
if self.terms[i].name is 'intercept':
_term = other.terms[j]
_term.namespace = other.namespace
elif other.terms[j].name is 'intercept':
_term = self.terms[i]
_term.namespace = self.namespace
else:
names = []
d1 = len(selfnames)
d2 = len(othernames)
for r in range(d1):
for s in range(d2):
name = '%s*%s' % (str(selfnames[r]), str(othernames[s]))
pieces = sorted(name.split('*'))
name = '*'.join(pieces)
names.append(name)
def product_func(value, d1=d1, d2=d2):
out = []
for r in range(d1):
for s in range(d2):
out.append(value[r] * value[d1+s])
return np.array(out)
cself = copy.copy(self.terms[i])
cother = copy.copy(other.terms[j])
sumterms = cself + cother
sumterms.terms = [cself, cother] # enforce the order we want
_term = Quantitative(names, func=sumterms,
termname=termname,
transform=product_func)
if _namespace_equal(self.namespace, other.namespace):
_term.namespace = self.namespace
terms.append(_term)
return Formula(terms)
def __add__(self, other):
"""
Return a formula whose columns are the
concatenation of the columns of self and other.
terms in the formula are sorted alphabetically.
"""
other = Formula(other)
terms = self.terms + other.terms
pieces = sorted([(term.name, term) for term in terms])
terms = [piece[1] for piece in pieces]
f = Formula(terms)
if _namespace_equal(self.namespace, other.namespace):
f.namespace = self.namespace
return f
def __sub__(self, other):
"""
Return a formula with all terms in other removed from self.
If other contains term instances not in formula, this
function does not raise an exception.
"""
other = Formula(other)
terms = copy.copy(self.terms)
for term in other.terms:
for i in range(len(terms)):
if terms[i].termname == term.termname:
terms.pop(i)
break
f = Formula(terms)
f.namespace = self.namespace
return f
[docs]def isnested(A, B, namespace=None):
"""
Is factor B nested within factor A or vice versa: a very crude test
which depends on the namespace.
If they are nested, returns (True, F) where F is the finest
level of the relationship. Otherwise, returns (False, None)
"""
if namespace is not None:
A = copy.copy(A); A.namespace = namespace
B = copy.copy(B); B.namespace = namespace
a = A(values=True)[0]
b = B(values=True)[0]
if len(a) != len(b):
raise ValueError('A() and B() should be sequences of the same length')
nA = len(set(a))
nB = len(set(b))
n = max(nA, nB)
AB = [(a[i],b[i]) for i in range(len(a))]
nAB = len(set(AB))
if nAB == n:
if nA > nB:
F = A
else:
F = B
return (True, F)
else:
return (False, None)
def _intercept_fn(nrow=1, **extra):
return np.ones((1,nrow))
I = Term('intercept', func=_intercept_fn)
I.__doc__ = """
Intercept term in a formula. If intercept is the
only term in the formula, then a keyword argument
\'nrow\' is needed.
>>> from nipy.fixes.scipy.stats.models.formula import Formula, I
>>> I()
array(1.0)
>>> I(nrow=5)
array([ 1., 1., 1., 1., 1.])
>>> f=Formula(I)
>>> f(nrow=5)
array([1, 1, 1, 1, 1])
"""
[docs]def interactions(terms, order=[1,2]):
"""
Output all pairwise interactions of given order of a
sequence of terms.
The argument order is a sequence specifying which order
of interactions should be generated -- the default
creates main effects and two-way interactions. If order
is an integer, it is changed to range(1,order+1), so
order=3 is equivalent to order=[1,2,3], generating
all one, two and three-way interactions.
If any entry of order is greater than len(terms), it is
effectively treated as len(terms).
>>> print interactions([Term(l) for l in ['a', 'b', 'c']])
<formula: a*b + a*c + b*c + a + b + c>
>>>
>>> print interactions([Term(l) for l in ['a', 'b', 'c']], order=list(range(5)))
<formula: a*b + a*b*c + a*c + b*c + a + b + c>
>>>
"""
l = len(terms)
values = {}
if np.asarray(order).shape == ():
order = lrange(1, int(order)+1)
# First order
for o in order:
I = np.indices((l,)*(o))
I.shape = (I.shape[0], np.product(I.shape[1:]))
for m in range(I.shape[1]):
# only keep combinations that have unique entries
if (np.unique(I[:,m]).shape == I[:,m].shape and
np.alltrue(np.equal(np.sort(I[:,m]), I[:,m]))):
ll = [terms[j] for j in I[:,m]]
v = ll[0]
for ii in range(len(ll)-1):
v *= ll[ii+1]
values[tuple(I[:,m])] = v
key = list(iterkeys(values))[0]
value = values[key]; del(values[key])
for v in itervalues(values):
value += v
return value
def _namespace_equal(space1, space2):
return space1 is space2