# This file is part of Patsy
# Copyright (C) 2013 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.
# Missing data detection/handling
# First, how do we represent missing data? (i.e., which values count as
# "missing"?) In the long run, we want to use numpy's NA support... but that
# doesn't exist yet. Until then, people use various sorts of ad-hoc
# things. Some things that might be considered NA:
# NA (eventually)
# NaN (in float or object arrays)
# None (in object arrays)
# np.ma.masked (in numpy.ma masked arrays)
# Pandas compatibility considerations:
# For numeric arrays, None is unconditionally converted to NaN.
# For object arrays (including string arrays!), None and NaN are preserved,
# but pandas.isnull() returns True for both.
# np.ma compatibility considerations:
# Preserving array subtypes is a huge pain, because it means that we can't
# just call 'asarray' and be done... we already jump through tons of hoops
# to write code that can handle both ndarrays and pandas objects, and
# just thinking about adding another item to this list makes me tired. So
# for now we don't support np.ma missing values. Use pandas!
# Next, what should be done once we find missing data? R's options:
# -- throw away those rows (from all aligned matrices)
# -- with or without preserving information on which rows were discarded
# -- error out
# -- carry on
# The 'carry on' option requires that we have some way to represent NA in our
# output array. To avoid further solidifying the use of NaN for this purpose,
# we'll leave this option out for now, until real NA support is
# available. Also, we always preserve information on which rows were
# discarded, using the pandas index functionality (currently this is only
# returned to the original caller if they used return_type="dataframe",
# though).
import numpy as np
from patsy import PatsyError
from patsy.util import (safe_isnan, safe_scalar_isnan,
no_pickling, assert_no_pickling)
# These are made available in the patsy.* namespace
__all__ = ["NAAction"]
# Names of the missing-value detection rules that NAAction accepts as
# entries of its NA_types argument.
_valid_NA_types = ["None", "NaN"]
# Names of the missing-value handling strategies that NAAction accepts as
# its on_NA argument.
_valid_NA_responses = ["raise", "drop"]
def _desc_options(options):
return ", ".join([repr(opt) for opt in options])
class NAAction(object):
    """An :class:`NAAction` object defines a strategy for handling missing
    data.

    "NA" is short for "Not Available", and is used to refer to any value which
    is somehow unmeasured or unavailable. In the long run, it is devoutly
    hoped that numpy will gain first-class missing value support. Until then,
    we work around this lack as best we're able.

    There are two parts to this: First, we have to determine what counts as
    missing data. For numerical data, the default is to treat NaN values
    (e.g., ``numpy.nan``) as missing. For categorical data, the default is to
    treat NaN values, and also the Python object None, as missing. (This is
    consistent with how pandas does things, so if you're already using
    None/NaN to mark missing data in your pandas DataFrames, you're good to
    go.)

    Second, we have to decide what to do with any missing data when we
    encounter it. One option is to simply discard any rows which contain
    missing data from our design matrices (``drop``). Another option is to
    raise an error (``raise``). A third option would be to simply let the
    missing values pass through into the returned design matrices. However,
    this last option is not yet implemented, because of the lack of any
    standard way to represent missing values in arbitrary numpy matrices;
    we're hoping numpy will get this sorted out before we standardize on
    anything ourselves.

    You can control how patsy handles missing data through the ``NA_action=``
    argument to functions like :func:`build_design_matrices` and
    :func:`dmatrix`. If all you want to do is to choose between ``drop`` and
    ``raise`` behaviour, you can pass one of those strings as the
    ``NA_action=`` argument directly. If you want more fine-grained control
    over how missing values are detected and handled, then you can create an
    instance of this class, or your own object that implements the same
    interface, and pass that as the ``NA_action=`` argument instead.
    """

    # The default for NA_types is an immutable tuple rather than a list so
    # that the default object can never be shared-and-mutated between calls.
    def __init__(self, on_NA="drop", NA_types=("None", "NaN")):
        """The :class:`NAAction` constructor takes the following arguments:

        :arg on_NA: How to handle missing values. The default is ``"drop"``,
          which removes all rows from all matrices which contain any missing
          values. Also available is ``"raise"``, which raises an exception
          when any missing values are encountered.
        :arg NA_types: Which rules are used to identify missing values, as a
          list (or other iterable) of strings. A bare string is rejected.
          Allowed values are:

          * ``"None"``: treat the ``None`` object as missing in categorical
            data.
          * ``"NaN"``: treat floating point NaN values as missing in
            categorical and numerical data.

        :raises ValueError: if `on_NA` is not a recognised action, if
          `NA_types` is a single string, or if any entry of `NA_types` is not
          a recognised rule name.

        .. versionadded:: 0.2.0
        """
        self.on_NA = on_NA
        if self.on_NA not in _valid_NA_responses:
            raise ValueError("invalid on_NA action %r "
                             "(should be one of %s)"
                             % (on_NA, _desc_options(_valid_NA_responses)))
        # A bare string is iterable, so tuple("NaN") would silently become
        # ("N", "a", "N"); reject it with a clearer message instead.
        if isinstance(NA_types, str):
            raise ValueError("NA_types should be a list of strings")
        self.NA_types = tuple(NA_types)
        for NA_type in self.NA_types:
            if NA_type not in _valid_NA_types:
                raise ValueError("invalid NA_type %r "
                                 "(should be one of %s)"
                                 % (NA_type, _desc_options(_valid_NA_types)))

    def is_categorical_NA(self, obj):
        """Return True if `obj` is a categorical NA value.

        Note that here `obj` is a single scalar value."""
        if "NaN" in self.NA_types and safe_scalar_isnan(obj):
            return True
        if "None" in self.NA_types and obj is None:
            return True
        return False

    def is_numerical_NA(self, arr):
        """Returns a 1-d mask array indicating which rows in an array of
        numerical values contain at least one NA value.

        Note that here `arr` is a numpy array or pandas DataFrame."""
        mask = np.zeros(arr.shape, dtype=bool)
        # Only the "NaN" rule applies to numerical data; with it disabled the
        # mask stays all-False.
        if "NaN" in self.NA_types:
            mask |= np.isnan(arr)
        # Collapse a per-element mask into a per-row mask.
        if mask.ndim > 1:
            mask = np.any(mask, axis=1)
        return mask

    def handle_NA(self, values, is_NAs, origins):
        """Takes a set of factor values that may have NAs, and handles them
        appropriately.

        :arg values: A list of `ndarray` objects representing the data.
          These may be 1- or 2-dimensional, and may be of varying dtype. All
          will have the same number of rows (or entries, for 1-d arrays).
        :arg is_NAs: A list with the same number of entries as `values`,
          containing boolean `ndarray` objects that indicate which rows
          contain NAs in the corresponding entry in `values`.
        :arg origins: A list with the same number of entries as
          `values`, containing information on the origin of each
          value. If we encounter a problem with some particular value, we use
          the corresponding entry in `origins` as the origin argument when
          raising a :class:`PatsyError`.
        :returns: A list of new values (which may have a differing number of
          rows.)
        :raises PatsyError: if ``on_NA`` is ``"raise"`` and any entry of
          `is_NAs` marks at least one row as missing.
        """
        assert len(values) == len(is_NAs) == len(origins)
        # Nothing to inspect; hand the (empty) input straight back.
        if len(values) == 0:
            return values
        if self.on_NA == "raise":
            return self._handle_NA_raise(values, is_NAs, origins)
        elif self.on_NA == "drop":
            return self._handle_NA_drop(values, is_NAs, origins)
        else:  # pragma: no cover
            # Only reachable if on_NA was changed after construction. Raise
            # explicitly so this cannot turn into a silent "return None"
            # when assertions are disabled.
            raise AssertionError("invalid on_NA action %r" % (self.on_NA,))

    def _handle_NA_raise(self, values, is_NAs, origins):
        # Fail on the first factor that has any missing row, reporting that
        # factor's origin; otherwise pass the values through untouched.
        for is_NA, origin in zip(is_NAs, origins):
            if np.any(is_NA):
                raise PatsyError("factor contains missing values", origin)
        return values

    def _handle_NA_drop(self, values, is_NAs, origins):
        # A row is dropped from every value if it is missing in any of them.
        total_mask = np.zeros(is_NAs[0].shape[0], dtype=bool)
        for is_NA in is_NAs:
            total_mask |= is_NA
        good_mask = ~total_mask
        # "..." to handle 1- versus 2-dim indexing
        return [v[good_mask, ...] for v in values]

    __getstate__ = no_pickling
def test_NAAction_basic():
    """Invalid constructor arguments raise ValueError; instances don't pickle."""
    # Checked with plain try/except (the same idiom test_NAAction_raise uses)
    # so that this test needs no third-party assertion helper.
    bad_kwargs = (
        {"on_NA": "pord"},               # unknown action
        {"NA_types": ("NaN", "asdf")},   # unknown rule name
        {"NA_types": "NaN"},             # bare string instead of a list
    )
    for kwargs in bad_kwargs:
        try:
            NAAction(**kwargs)
        except ValueError:
            pass
        else:
            raise AssertionError("NAAction(**%r) did not raise ValueError"
                                 % (kwargs,))

    assert_no_pickling(NAAction())
def test_NAAction_NA_types_numerical():
    """is_numerical_NA flags the NaN rows only when "NaN" is in NA_types."""
    nan_rows = [0, 2]
    for NA_types in ([], ["NaN"], ["None"], ["NaN", "None"]):
        action = NAAction(NA_types=NA_types)
        # The expected mask depends only on the enabled rules, not on the
        # shape of the data, so build it once per rule set.
        expected = np.zeros(4, dtype=bool)
        if "NaN" in NA_types:
            expected[nan_rows] = True
        for extra_shape in ((), (1,), (2,)):
            data = np.ones((4,) + extra_shape, dtype=float)
            has_several_columns = data.ndim > 1 and data.shape[1] > 1
            if has_several_columns:
                # Put a single NaN in each affected row, in different columns.
                data[nan_rows, [0, 1]] = np.nan
            else:
                data[nan_rows] = np.nan
            observed = action.is_numerical_NA(data)
            assert np.array_equal(observed, expected)
def test_NAAction_NA_types_categorical():
    """is_categorical_NA honours exactly the rules listed in NA_types."""
    for NA_types in ([], ["NaN"], ["None"], ["NaN", "None"]):
        action = NAAction(NA_types=NA_types)
        none_counts_as_NA = "None" in NA_types
        nan_counts_as_NA = "NaN" in NA_types
        # Ordinary values are never missing, whatever rules are enabled.
        for ordinary in ("a", 1):
            assert not action.is_categorical_NA(ordinary)
        assert action.is_categorical_NA(None) == none_counts_as_NA
        assert action.is_categorical_NA(np.nan) == nan_counts_as_NA
def test_NAAction_drop():
    """With on_NA="drop", rows flagged in any mask vanish from every value."""
    action = NAAction("drop")

    int_column = np.asarray([-1, 2, -1, 4, 5])
    float_column = np.asarray([10.0, 20.0, 30.0, 40.0, 50.0])
    matrix = np.asarray([[1.0, np.nan],
                         [3.0, 4.0],
                         [10.0, 5.0],
                         [6.0, 7.0],
                         [8.0, np.nan]])
    # Rows 0, 2 and 4 are flagged by at least one mask, leaving rows 1 and 3.
    masks = [
        np.asarray([True, False, True, False, False]),
        np.zeros(5, dtype=bool),
        np.asarray([True, False, False, False, True]),
    ]

    result = action.handle_NA([int_column, float_column, matrix],
                              masks, [None] * 3)

    expected = [
        [2, 4],
        [20.0, 40.0],
        [[3.0, 4.0], [6.0, 7.0]],
    ]
    assert len(result) == 3
    for got, want in zip(result, expected):
        assert np.array_equal(got, want)
def test_NAAction_raise():
    """With on_NA="raise", clean data passes through and NA data raises."""
    action = NAAction(on_NA="raise")

    # no-NA just passes through:
    in_arrs = [np.asarray([1.1, 1.2]),
               np.asarray([1, 2])]
    is_NAs = [np.asarray([False, False])] * 2
    got_arrs = action.handle_NA(in_arrs, is_NAs, [None, None])
    assert np.array_equal(got_arrs[0], in_arrs[0])
    assert np.array_equal(got_arrs[1], in_arrs[1])

    from patsy.origin import Origin
    o1 = Origin("asdf", 0, 1)
    o2 = Origin("asdf", 2, 3)

    # NA raises an error with a correct origin
    in_arrs = [np.asarray([1.1, 1.2]),
               np.asarray([1.0, np.nan])]
    is_NAs = [np.asarray([False, False]),
              np.asarray([False, True])]
    try:
        action.handle_NA(in_arrs, is_NAs, [o1, o2])
    except PatsyError as e:
        # Only the second factor has a missing row, so its origin is reported.
        assert e.origin is o2
    else:
        # Raised explicitly so the "must raise" check does not depend on
        # assert statements being enabled.
        raise AssertionError("handle_NA did not raise PatsyError")