# Source code for patsy.missing

# This file is part of Patsy
# Copyright (C) 2013 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# Missing data detection/handling

# First, how do we represent missing data? (i.e., which values count as
# "missing"?) In the long run, we want to use numpy's NA support... but that
# doesn't exist yet. Until then, people use various sorts of ad-hoc
# things. Some things that might be considered NA:
#   NA (eventually)
#   NaN  (in float or object arrays)
#   None (in object arrays)
#   np.ma.masked (in numpy.ma masked arrays)
# Pandas compatibility considerations:
#   For numeric arrays, None is unconditionally converted to NaN.
#   For object arrays (including string arrays!), None and NaN are preserved,
#     but pandas.isnull() returns True for both.
# np.ma compatibility considerations:
#   Preserving array subtypes is a huge pain, because it means that we can't
#   just call 'asarray' and be done... we already jump through tons of hoops
#   to write code that can handle both ndarrays and pandas objects, and
#   just thinking about adding another item to this list makes me tired. So
#   for now we don't support np.ma missing values. Use pandas!

# Next, what should be done once we find missing data? R's options:
#   -- throw away those rows (from all aligned matrices)
#      -- with or without preserving information on which rows were discarded
#   -- error out
#   -- carry on
# The 'carry on' option requires that we have some way to represent NA in our
# output array. To avoid further solidifying the use of NaN for this purpose,
# we'll leave this option out for now, until real NA support is
# available. Also, we always preserve information on which rows were
# discarded, using the pandas index functionality (currently this is only
# returned to the original caller if they used return_type="dataframe",
# though).

import numpy as np
from patsy import PatsyError
from patsy.util import (safe_isnan, safe_scalar_isnan,
                        no_pickling, assert_no_pickling)

# These are made available in the patsy.* namespace
__all__ = ["NAAction"]

# Names accepted as entries of the NA_types argument to NAAction.
_valid_NA_types = ["None", "NaN"]
# Names accepted as the on_NA argument to NAAction.
_valid_NA_responses = ["raise", "drop"]
def _desc_options(options):
    """Render `options` as a comma-separated string of their reprs.

    Used to list the permitted values in error messages.
    """
    quoted = map(repr, options)
    return ", ".join(quoted)

class NAAction(object):
    """An :class:`NAAction` object defines a strategy for handling missing
    data.

    "NA" is short for "Not Available", and is used to refer to any value which
    is somehow unmeasured or unavailable. In the long run, it is devoutly
    hoped that numpy will gain first-class missing value support. Until then,
    we work around this lack as best we're able.

    There are two parts to this: First, we have to determine what counts as
    missing data. For numerical data, the default is to treat NaN values
    (e.g., ``numpy.nan``) as missing. For categorical data, the default is to
    treat NaN values, and also the Python object None, as missing. (This is
    consistent with how pandas does things, so if you're already using
    None/NaN to mark missing data in your pandas DataFrames, you're good to
    go.)

    Second, we have to decide what to do with any missing data when we
    encounter it. One option is to simply discard any rows which contain
    missing data from our design matrices (``drop``). Another option is to
    raise an error (``raise``). A third option would be to simply let the
    missing values pass through into the returned design matrices. However,
    this last option is not yet implemented, because of the lack of any
    standard way to represent missing values in arbitrary numpy matrices;
    we're hoping numpy will get this sorted out before we standardize on
    anything ourselves.

    You can control how patsy handles missing data through the ``NA_action=``
    argument to functions like :func:`build_design_matrices` and
    :func:`dmatrix`. If all you want to do is to choose between ``drop`` and
    ``raise`` behaviour, you can pass one of those strings as the
    ``NA_action=`` argument directly. If you want more fine-grained control
    over how missing values are detected and handled, then you can create an
    instance of this class, or your own object that implements the same
    interface, and pass that as the ``NA_action=`` argument instead.
    """
    def __init__(self, on_NA="drop", NA_types=("None", "NaN")):
        """The :class:`NAAction` constructor takes the following arguments:

        :arg on_NA: How to handle missing values. The default is ``"drop"``,
          which removes all rows from all matrices which contain any missing
          values. Also available is ``"raise"``, which raises an exception
          when any missing values are encountered.
        :arg NA_types: Which rules are used to identify missing values, as a
          list of strings. Allowed values are:

          * ``"None"``: treat the ``None`` object as missing in categorical
            data.
          * ``"NaN"``: treat floating point NaN values as missing in
            categorical and numerical data.

        :raises ValueError: if `on_NA` is not a recognized action, if
          `NA_types` is a bare string rather than a list of strings, or if
          any entry of `NA_types` is not a recognized rule name.

        .. versionadded:: 0.2.0
        """
        self.on_NA = on_NA
        if self.on_NA not in _valid_NA_responses:
            raise ValueError("invalid on_NA action %r "
                             "(should be one of %s)"
                             % (on_NA, _desc_options(_valid_NA_responses)))
        # A bare string is itself iterable, so without this check
        # NA_types="NaN" would be split into single characters below.
        if isinstance(NA_types, str):
            raise ValueError("NA_types should be a list of strings")
        # Stored as a tuple so the caller's sequence is never shared.
        self.NA_types = tuple(NA_types)
        for NA_type in self.NA_types:
            if NA_type not in _valid_NA_types:
                raise ValueError("invalid NA_type %r "
                                 "(should be one of %s)"
                                 % (NA_type, _desc_options(_valid_NA_types)))

    def is_categorical_NA(self, obj):
        """Return True if `obj` is a categorical NA value.

        Note that here `obj` is a single scalar value."""
        if "NaN" in self.NA_types and safe_scalar_isnan(obj):
            return True
        if "None" in self.NA_types and obj is None:
            return True
        return False

    def is_numerical_NA(self, arr):
        """Returns a 1-d mask array indicating which rows in an array of
        numerical values contain at least one NA value.

        Only the ``"NaN"`` rule is consulted here; if it is not enabled in
        ``NA_types`` the returned mask is all False.

        Note that here `arr` is a numpy array or pandas DataFrame."""
        mask = np.zeros(arr.shape, dtype=bool)
        if "NaN" in self.NA_types:
            mask |= np.isnan(arr)
        # Collapse a per-element mask into a per-row mask.
        if mask.ndim > 1:
            mask = np.any(mask, axis=1)
        return mask

    def handle_NA(self, values, is_NAs, origins):
        """Takes a set of factor values that may have NAs, and handles them
        appropriately.

        :arg values: A list of `ndarray` objects representing the data.
          These may be 1- or 2-dimensional, and may be of varying dtype. All
          will have the same number of rows (or entries, for 1-d arrays).
        :arg is_NAs: A list with the same number of entries as `values`,
          containing boolean `ndarray` objects that indicate which rows
          contain NAs in the corresponding entry in `values`.
        :arg origins: A list with the same number of entries as
          `values`, containing information on the origin of each
          value. If we encounter a problem with some particular value, we use
          the corresponding entry in `origins` as the origin argument when
          raising a :class:`PatsyError`.
        :returns: A list of new values (which may have a differing number of
          rows.)
        """
        assert len(values) == len(is_NAs) == len(origins)
        if len(values) == 0:
            return values
        if self.on_NA == "raise":
            return self._handle_NA_raise(values, is_NAs, origins)
        elif self.on_NA == "drop":
            return self._handle_NA_drop(values, is_NAs, origins)
        else: # pragma: no cover
            assert False

    def _handle_NA_raise(self, values, is_NAs, origins):
        """Return `values` unchanged, or raise :class:`PatsyError` for the
        first entry whose mask in `is_NAs` has any True element, passing the
        matching entry of `origins` as the error's origin."""
        for is_NA, origin in zip(is_NAs, origins):
            if np.any(is_NA):
                raise PatsyError("factor contains missing values", origin)
        return values

    def _handle_NA_drop(self, values, is_NAs, origins):
        """Return copies of `values` with every row removed that is flagged
        in at least one of the masks in `is_NAs`. `origins` is unused."""
        total_mask = np.zeros(is_NAs[0].shape[0], dtype=bool)
        for is_NA in is_NAs:
            total_mask |= is_NA
        good_mask = ~total_mask
        # "..." to handle 1- versus 2-dim indexing
        return [v[good_mask, ...] for v in values]

    __getstate__ = no_pickling

def test_NAAction_basic():
    """Check constructor argument validation and that pickling is refused.

    Uses a plain try/except rather than ``nose.tools.assert_raises`` so the
    test has no dependency on the unmaintained ``nose`` package.
    """
    # Each of these keyword sets must be rejected with ValueError.
    bad_kwargs = [
        {"on_NA": "pord"},
        {"NA_types": ("NaN", "asdf")},
        {"NA_types": "NaN"},
    ]
    for kwargs in bad_kwargs:
        try:
            NAAction(**kwargs)
        except ValueError:
            pass
        else:
            raise AssertionError("ValueError not raised by NAAction(**%r)"
                                 % (kwargs,))

    assert_no_pickling(NAAction())

def test_NAAction_NA_types_numerical():
    """is_numerical_NA flags NaN rows only when "NaN" is among the NA_types,
    for 1-d, single-column and two-column inputs."""
    nan_rows = [0, 2]
    for NA_types in ([], ["NaN"], ["None"], ["NaN", "None"]):
        action = NAAction(NA_types=NA_types)
        # The expected mask depends only on NA_types, not on the array shape.
        expected = np.zeros(4, dtype=bool)
        if "NaN" in NA_types:
            expected[nan_rows] = True
        for trailing in ((), (1,), (2,)):
            data = np.ones((4,) + trailing, dtype=float)
            if data.ndim > 1 and data.shape[1] > 1:
                # One NaN per affected row, each in a different column.
                data[nan_rows, [0, 1]] = np.nan
            else:
                data[nan_rows] = np.nan
            observed = action.is_numerical_NA(data)
            assert np.array_equal(observed, expected)

def test_NAAction_NA_types_categorical():
    """is_categorical_NA treats None / NaN as missing exactly when the
    matching rule name is listed in NA_types."""
    for NA_types in ([], ["NaN"], ["None"], ["NaN", "None"]):
        action = NAAction(NA_types=NA_types)
        # Ordinary values are never missing, whatever the rules.
        for ordinary in ("a", 1):
            assert not action.is_categorical_NA(ordinary)
        none_is_missing = "None" in NA_types
        nan_is_missing = "NaN" in NA_types
        assert action.is_categorical_NA(None) == none_is_missing
        assert action.is_categorical_NA(np.nan) == nan_is_missing

def test_NAAction_drop():
    """With on_NA="drop", rows flagged in any mask are removed from every
    value array."""
    action = NAAction("drop")
    ints = np.asarray([-1, 2, -1, 4, 5])
    floats = np.asarray([10.0, 20.0, 30.0, 40.0, 50.0])
    matrix = np.asarray([[1.0, np.nan],
                         [3.0, 4.0],
                         [10.0, 5.0],
                         [6.0, 7.0],
                         [8.0, np.nan]])
    is_NAs = [
        np.asarray([True, False, True, False, False]),
        np.zeros(5, dtype=bool),
        np.asarray([True, False, False, False, True]),
    ]
    out_values = action.handle_NA([ints, floats, matrix], is_NAs, [None] * 3)
    # Rows 0, 2 and 4 are flagged by at least one mask; rows 1 and 3 survive.
    expected = [
        [2, 4],
        [20.0, 40.0],
        [[3.0, 4.0], [6.0, 7.0]],
    ]
    assert len(out_values) == 3
    for got, want in zip(out_values, expected):
        assert np.array_equal(got, want)
    
def test_NAAction_raise():
    """With on_NA="raise", NA-free input passes through unchanged and any NA
    raises PatsyError carrying the origin of the offending value."""
    action = NAAction(on_NA="raise")

    # no-NA just passes through:
    in_arrs = [np.asarray([1.1, 1.2]),
               np.asarray([1, 2])]
    is_NAs = [np.asarray([False, False])] * 2
    got_arrs = action.handle_NA(in_arrs, is_NAs, [None, None])
    assert np.array_equal(got_arrs[0], in_arrs[0])
    assert np.array_equal(got_arrs[1], in_arrs[1])

    from patsy.origin import Origin
    o1 = Origin("asdf", 0, 1)
    o2 = Origin("asdf", 2, 3)

    # NA raises an error with a correct origin
    in_arrs = [np.asarray([1.1, 1.2]),
               np.asarray([1.0, np.nan])]
    is_NAs = [np.asarray([False, False]),
              np.asarray([False, True])]
    try:
        action.handle_NA(in_arrs, is_NAs, [o1, o2])
    except PatsyError as e:
        # Only the second array is flagged, so its origin must be reported.
        assert e.origin is o2
    else:
        # Explicit raise (not a bare assert) so this still fails under -O.
        raise AssertionError("handle_NA did not raise PatsyError")