Source code for pandas.core.categorical

# pylint: disable=E1101,W0232

import numpy as np
from warnings import warn
import types

from pandas import compat, lib
from pandas.compat import u

from pandas.types.generic import ABCSeries, ABCIndexClass, ABCCategoricalIndex
from pandas.types.missing import isnull, notnull
from pandas.types.cast import (_possibly_infer_to_datetimelike,
                               _coerce_indexer_dtype)
from pandas.types.dtypes import CategoricalDtype
from pandas.types.common import (_ensure_int64,
                                 _ensure_object,
                                 _ensure_platform_int,
                                 is_dtype_equal,
                                 is_datetimelike,
                                 is_categorical_dtype,
                                 is_integer_dtype, is_bool,
                                 is_list_like, is_sequence,
                                 is_scalar)
from pandas.core.common import is_null_slice

from pandas.core.algorithms import factorize, take_1d
from pandas.core.base import (PandasObject, PandasDelegate,
                              NoNewAttributesMixin, _shared_docs)
import pandas.core.common as com
from pandas.core.missing import interpolate_2d
from pandas.compat.numpy import function as nv
from pandas.util.decorators import (Appender, cache_readonly,
                                    deprecate_kwarg, Substitution)

from pandas.util.terminal import get_terminal_size
from pandas.core.config import get_option


def _cat_compare_op(op):
    """Build the rich-comparison method called ``op`` for ``Categorical``.

    Parameters
    ----------
    op : str
        Name of the comparison method to build, e.g. ``'__eq__'`` or
        ``'__lt__'``.

    Returns
    -------
    function
        A method ``f(self, other)`` whose ``__name__`` is ``op``. It returns
        a boolean array with one entry per element of ``self``.

    Notes
    -----
    The returned method raises ``TypeError`` when

    * an ordering comparison is requested on an unordered categorical,
    * ``other`` is a Categorical with different categories or a different
      ``ordered`` flag,
    * an ordering comparison is made against a scalar which is not a
      category, or against a non-scalar which is not a Categorical.
    """
    def f(self, other):
        # On python2, you can usually compare any type to any type, and
        # Categoricals can be seen as a custom type, but having different
        # results depending whether categories are the same or not is kind of
        # insane, so be a bit stricter here and use the python3 idea of
        # comparing only things of equal type.
        if not self.ordered:
            if op in ['__lt__', '__gt__', '__le__', '__ge__']:
                raise TypeError("Unordered Categoricals can only compare "
                                "equality or not")
        if isinstance(other, Categorical):
            # Two Categoricals can only be compared if the categories are
            # the same
            if ((len(self.categories) != len(other.categories)) or
                    not ((self.categories == other.categories).all())):
                raise TypeError("Categoricals can only be compared if "
                                "'categories' are the same")
            if not (self.ordered == other.ordered):
                raise TypeError("Categoricals can only be compared if "
                                "'ordered' is the same")
            na_mask = (self._codes == -1) | (other._codes == -1)
            ret = getattr(self._codes, op)(other._codes)
            if na_mask.any():
                # In other series, this leads to False, so do that here too
                ret[na_mask] = False
            return ret

        # Numpy-1.9 and earlier may convert a scalar to a zerodim array during
        # comparison operation when second arg has higher priority, e.g.
        #
        #     cat[0] < cat
        #
        # With cat[0], for example, being ``np.int64(1)`` by the time it gets
        # into this function would become ``np.array(1)``.
        other = lib.item_from_zerodim(other)
        if is_scalar(other):
            if other in self.categories:
                i = self.categories.get_loc(other)
                ret = getattr(self._codes, op)(i)
                if op in ('__lt__', '__le__'):
                    # Missing values are stored as code -1, which is smaller
                    # than every valid category position. Without this mask
                    # a missing value would compare as "less than" any
                    # category; keep it False like the Categorical branch.
                    ret[self._codes == -1] = False
                return ret
            else:
                if op == '__eq__':
                    return np.repeat(False, len(self))
                elif op == '__ne__':
                    return np.repeat(True, len(self))
                else:
                    msg = ("Cannot compare a Categorical for op {op} with a "
                           "scalar, which is not a category.")
                    raise TypeError(msg.format(op=op))
        else:

            # allow categorical vs object dtype array comparisons for equality
            # these are only positional comparisons
            if op in ['__eq__', '__ne__']:
                return getattr(np.array(self), op)(np.array(other))

            msg = ("Cannot compare a Categorical for op {op} with type {typ}."
                   "\nIf you want to compare values, use 'np.asarray(cat) "
                   "<op> other'.")
            raise TypeError(msg.format(op=op, typ=type(other)))

    f.__name__ = op

    return f


def maybe_to_categorical(array):
    """Unwrap a Series or CategoricalIndex to its underlying ``_values``.

    Anything else is handed back untouched.
    """
    wrapper_types = (ABCSeries, ABCCategoricalIndex)
    return array._values if isinstance(array, wrapper_types) else array


# Documentation text passed as ``doc=`` to the ``Categorical.codes`` property.
_codes_doc = """The category codes of this categorical.

Level codes are an array of integers which are the positions of the real
values in the categories array.

There is no setter, use the other categorical methods and the normal item
setter to change values in the categorical.
"""

# Documentation text passed as ``doc=`` to the ``Categorical.categories``
# property.
_categories_doc = """The categories of this categorical.

Setting assigns new values to each category (effectively a rename of
each individual category).

The assigned value has to be a list-like object. All items must be unique and
the number of items in the new categories must be the same as the number of
items in the old categories.

Assigning to `categories` is an inplace operation!

Raises
------
ValueError
    If the new categories do not validate as categories or if the number of new
    categories is unequal to the number of old categories

See also
--------
rename_categories
reorder_categories
add_categories
remove_categories
remove_unused_categories
set_categories
"""


class Categorical(PandasObject):
    """
    Represents a categorical variable in classic R / S-plus fashion

    `Categoricals` can take on only a limited, and usually fixed, number
    of possible values (`categories`). In contrast to statistical categorical
    variables, a `Categorical` might have an order, but numerical operations
    (additions, divisions, ...) are not possible.

    All values of the `Categorical` are either in `categories` or `np.nan`.
    Assigning values outside of `categories` will raise a `ValueError`. Order
    is defined by the order of the `categories`, not lexical order of the
    values.

    Parameters
    ----------
    values : list-like
        The values of the categorical. If categories are given, values not in
        categories will be replaced with NaN.
    categories : Index-like (unique), optional
        The unique categories for this categorical. If not given, the
        categories are assumed to be the unique values of values.
    ordered : boolean, (default False)
        Whether or not this categorical is treated as an ordered categorical.
        If not given, the resulting categorical will not be ordered.

    Attributes
    ----------
    categories : Index
        The categories of this categorical
    codes : ndarray
        The codes (integer positions, which point to the categories) of this
        categorical, read only.
    ordered : boolean
        Whether or not this Categorical is ordered.

    Raises
    ------
    ValueError
        If the categories do not validate.
    TypeError
        If an explicit ``ordered=True`` is given but no `categories` and the
        `values` are not sortable.

    Examples
    --------
    >>> from pandas import Categorical
    >>> Categorical([1, 2, 3, 1, 2, 3])
    [1, 2, 3, 1, 2, 3]
    Categories (3, int64): [1, 2, 3]

    >>> Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
    [a, b, c, a, b, c]
    Categories (3, object): [a, b, c]

    >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a'],
    ...                 ordered=True)
    >>> a.min()
    'c'
    """

    dtype = CategoricalDtype()
    """The dtype (always "category")"""
    """Whether or not this Categorical is ordered.

    Only ordered `Categoricals` can be sorted (according to the order of the
    categories) and have a min and max value.

    See also
    --------
    Categorical.sort
    Categorical.order
    Categorical.min
    Categorical.max
    """

    # For comparisons, so that numpy uses our implementation if the compare
    # ops, which raise
    __array_priority__ = 1000
    _typ = 'categorical'

    def __init__(self, values, categories=None, ordered=False,
                 name=None, fastpath=False):
        """
        Build the codes and categories from `values`.

        See the class docstring for `values`, `categories` and `ordered`.
        With ``fastpath=True`` `values` are used directly as the codes
        (no factorization and no input sanitizing is done). Passing `name`
        only emits a ``UserWarning``; the value is otherwise unused.
        """
        if fastpath:
            # fast path
            self._codes = _coerce_indexer_dtype(values, categories)
            self._categories = self._validate_categories(
                categories, fastpath=isinstance(categories, ABCIndexClass))
            self._ordered = ordered
            return

        if name is not None:
            msg = ("the 'name' keyword is removed, use 'name' with consumers "
                   "of the categorical instead (e.g. 'Series(cat, "
                   "name=\"something\")'")
            warn(msg, UserWarning, stacklevel=2)

        # sanitize input
        if is_categorical_dtype(values):

            # we are either a Series or a CategoricalIndex
            if isinstance(values, (ABCSeries, ABCCategoricalIndex)):
                values = values._values

            if ordered is None:
                ordered = values.ordered
            if categories is None:
                categories = values.categories
            values = values.__array__()

        elif isinstance(values, (ABCIndexClass, ABCSeries)):
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by
            # _sanitize_array which is fine, but since factorize does this
            # correctly no need here this is an issue because _sanitize_array
            # also coerces np.nan to a string under certain versions of numpy
            # as well
            values = _possibly_infer_to_datetimelike(values,
                                                     convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use
                # "object" dtype to prevent this. In the end objects will be
                # casted to int/... in the category assignment step.
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if categories is None:
            try:
                codes, categories = factorize(values, sort=True)
            except TypeError:
                codes, categories = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so
                    # the user should give us one by specifying categories
                    raise TypeError("'values' is not ordered, please "
                                    "explicitly specify the categories order "
                                    "by passing in a categories argument.")
            except ValueError:

                # FIXME
                raise NotImplementedError("> 1 ndim Categorical are not "
                                          "supported at this time")

            categories = self._validate_categories(categories)

        else:
            # there were two ways if categories are present
            # - the old one, where each value is a int pointer to the levels
            #   array -> not anymore possible, but code outside of pandas could
            #   call us like that, so make some checks
            # - the new one, where each value is also in the categories array
            #   (or np.nan)

            # make sure that we always have the same type here, no matter what
            # we get passed in
            categories = self._validate_categories(categories)
            codes = _get_codes_for_values(values, categories)

            # TODO: check for old style usage. These warnings should be removes
            # after 0.18/ in 2016
            if is_integer_dtype(values) and not is_integer_dtype(categories):
                warn("Values and categories have different dtypes. Did you "
                     "mean to use\n'Categorical.from_codes(codes, "
                     "categories)'?", RuntimeWarning, stacklevel=2)

            if (len(values) and is_integer_dtype(values) and
                    (codes == -1).all()):
                warn("None of the categories were found in values. Did you "
                     "mean to use\n'Categorical.from_codes(codes, "
                     "categories)'?", RuntimeWarning, stacklevel=2)

        self.set_ordered(ordered or False, inplace=True)
        self._categories = categories
        self._codes = _coerce_indexer_dtype(codes, categories)

    def copy(self):
        """ Copy constructor. """
        return Categorical(values=self._codes.copy(),
                           categories=self.categories, ordered=self.ordered,
                           fastpath=True)

    def astype(self, dtype, copy=True):
        """
        Coerce this type to another dtype

        Parameters
        ----------
        dtype : numpy dtype or pandas type
        copy : bool, default True
            By default, astype always returns a newly allocated object.
            If copy is set to False and dtype is categorical, the original
            object is returned.

            .. versionadded:: 0.19.0

        """
        if is_categorical_dtype(dtype):
            if copy is True:
                return self.copy()
            return self
        return np.array(self, dtype=dtype, copy=copy)

    @cache_readonly
    def ndim(self):
        """Number of dimensions of the Categorical """
        return self._codes.ndim

    @cache_readonly
    def size(self):
        """ return the len of myself """
        return len(self)

    @cache_readonly
    def itemsize(self):
        """ return the size of a single category """
        return self.categories.itemsize

    def reshape(self, new_shape, *args, **kwargs):
        """
        DEPRECATED: calling this method will raise an error in a
        future release.

        An ndarray-compatible method that returns `self` because
        `Categorical` instances cannot actually be reshaped.

        Parameters
        ----------
        new_shape : int or tuple of ints
            A 1-D array of integers that correspond to the new
            shape of the `Categorical`. For more information on
            the parameter, please refer to `np.reshape`.
        """
        warn("reshape is deprecated and will raise "
             "in a subsequent release", FutureWarning, stacklevel=2)

        nv.validate_reshape(args, kwargs)

        # while the 'new_shape' parameter has no effect,
        # we should still enforce valid shape parameters
        np.reshape(self.codes, new_shape)

        return self

    @property
    def base(self):
        """ compat, we are always our own object """
        return None

    @classmethod
    def from_array(cls, data, **kwargs):
        """
        Make a Categorical type from a single array-like object.

        For internal compatibility with numpy arrays.

        Parameters
        ----------
        data : array-like
            Can be an Index or array-like. The categories are assumed to be
            the unique values of `data`.
        """
        return Categorical(data, **kwargs)

    @classmethod
    def from_codes(cls, codes, categories, ordered=False, name=None):
        """
        Make a Categorical type from codes and categories arrays.

        This constructor is useful if you already have codes and categories
        and so do not need the (computation intensive) factorization step,
        which is usually done on the constructor.

        If your data does not follow this convention, please use the normal
        constructor.

        Parameters
        ----------
        codes : array-like, integers
            An integer array, where each integer points to a category in
            categories or -1 for NaN
        categories : index-like
            The categories for the categorical. Items need to be unique.
        ordered : boolean, (default False)
            Whether or not this categorical is treated as an ordered
            categorical. If not given, the resulting categorical will be
            unordered.

        Raises
        ------
        ValueError
            If `codes` cannot be converted to an integer array, or if any
            code lies outside ``-1 .. len(categories) - 1``.
        """
        if name is not None:
            msg = ("the 'name' keyword is removed, use 'name' with consumers "
                   "of the categorical instead (e.g. 'Series(cat, "
                   "name=\"something\")'")
            warn(msg, UserWarning, stacklevel=2)

        try:
            codes = np.asarray(codes, np.int64)
        except (ValueError, TypeError, OverflowError):
            # only conversion failures are translated; a bare ``except``
            # would also swallow KeyboardInterrupt / SystemExit
            raise ValueError(
                "codes need to be convertible to an arrays of integers")

        categories = cls._validate_categories(categories)

        if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
            raise ValueError("codes need to be between -1 and "
                             "len(categories)-1")

        return Categorical(codes, categories=categories, ordered=ordered,
                           fastpath=True)
_codes = None def _get_codes(self): """ Get the codes. Returns ------- codes : integer array view A non writable view of the `codes` array. """ v = self._codes.view() v.flags.writeable = False return v def _set_codes(self, codes): """ Not settable by the user directly """ raise ValueError("cannot set Categorical codes directly") codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) def _get_labels(self): """ Get the category labels (deprecated). Deprecated, use .codes! """ warn("'labels' is deprecated. Use 'codes' instead", FutureWarning, stacklevel=2) return self.codes labels = property(fget=_get_labels, fset=_set_codes) _categories = None @classmethod def _validate_categories(cls, categories, fastpath=False): """ Validates that we have good categories Parameters ---------- fastpath : boolean (default: False) Don't perform validation of the categories for uniqueness or nulls """ if not isinstance(categories, ABCIndexClass): dtype = None if not hasattr(categories, "dtype"): categories = _convert_to_list_like(categories) # On categories with NaNs, int values would be converted to # float. Use "object" dtype to prevent this. 
if isnull(categories).any(): without_na = np.array([x for x in categories if notnull(x)]) with_na = np.array(categories) if with_na.dtype != without_na.dtype: dtype = "object" from pandas import Index categories = Index(categories, dtype=dtype) if not fastpath: # check properties of the categories # we don't allow NaNs in the categories themselves if categories.hasnans: # NaNs in cats deprecated in 0.17, # remove in 0.18 or 0.19 GH 10748 msg = ('\nSetting NaNs in `categories` is deprecated and ' 'will be removed in a future version of pandas.') warn(msg, FutureWarning, stacklevel=3) # categories must be unique if not categories.is_unique: raise ValueError('Categorical categories must be unique') return categories def _set_categories(self, categories, fastpath=False): """ Sets new categories Parameters ---------- fastpath : boolean (default: False) Don't perform validation of the categories for uniqueness or nulls """ categories = self._validate_categories(categories, fastpath=fastpath) if (not fastpath and self._categories is not None and len(categories) != len(self._categories)): raise ValueError("new categories need to have the same number of " "items than the old categories!") self._categories = categories def _get_categories(self): """ Gets the categories """ # categories is an Index, which is immutable -> no need to copy return self._categories categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc) _ordered = None def set_ordered(self, value, inplace=False): """ Sets the ordered attribute to the boolean value Parameters ---------- value : boolean to set whether this categorical is ordered (True) or not (False) inplace : boolean (default: False) Whether or not to set the ordered attribute inplace or return a copy of this categorical with ordered set to the value """ if not is_bool(value): raise TypeError("ordered must be a boolean value") cat = self if inplace else self.copy() cat._ordered = value if not inplace: return cat def 
as_ordered(self, inplace=False): """ Sets the Categorical to be ordered Parameters ---------- inplace : boolean (default: False) Whether or not to set the ordered attribute inplace or return a copy of this categorical with ordered set to True """ return self.set_ordered(True, inplace=inplace) def as_unordered(self, inplace=False): """ Sets the Categorical to be unordered Parameters ---------- inplace : boolean (default: False) Whether or not to set the ordered attribute inplace or return a copy of this categorical with ordered set to False """ return self.set_ordered(False, inplace=inplace) def _get_ordered(self): """ Gets the ordered attribute """ return self._ordered ordered = property(fget=_get_ordered) def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): """ Sets the categories to the specified new_categories. `new_categories` can include new categories (which will result in unused categories) or remove old categories (which results in values set to NaN). If `rename==True`, the categories will simple be renamed (less or more items than in old categories will result in values set to NaN or in unused categories respectively). This method can be used to perform more than one action of adding, removing, and reordering simultaneously and is therefore faster than performing the individual steps via the more specialised methods. On the other hand this methods does not do checks (e.g., whether the old categories are included in the new categories on a reorder), which can result in surprising changes, for example when using special string dtypes on python3, which does not considers a S1 string equal to a single char python string. Raises ------ ValueError If new_categories does not validate as categories Parameters ---------- new_categories : Index-like The categories in new order. ordered : boolean, (default: False) Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. 
rename : boolean (default: False) Whether or not the new_categories should be considered as a rename of the old categories or as reordered categories. inplace : boolean (default: False) Whether or not to reorder the categories inplace or return a copy of this categorical with reordered categories. Returns ------- cat : Categorical with reordered categories or None if inplace. See also -------- rename_categories reorder_categories add_categories remove_categories remove_unused_categories """ new_categories = self._validate_categories(new_categories) cat = self if inplace else self.copy() if rename: if (cat._categories is not None and len(new_categories) < len(cat._categories)): # remove all _codes which are larger and set to -1/NaN self._codes[self._codes >= len(new_categories)] = -1 else: values = cat.__array__() cat._codes = _get_codes_for_values(values, new_categories) cat._categories = new_categories if ordered is None: ordered = self.ordered cat.set_ordered(ordered, inplace=True) if not inplace: return cat def rename_categories(self, new_categories, inplace=False): """ Renames categories. The new categories has to be a list-like object. All items must be unique and the number of items in the new categories must be the same as the number of items in the old categories. Raises ------ ValueError If the new categories do not have the same number of items than the current categories or do not validate as categories Parameters ---------- new_categories : Index-like The renamed categories. inplace : boolean (default: False) Whether or not to rename the categories inplace or return a copy of this categorical with renamed categories. Returns ------- cat : Categorical with renamed categories added or None if inplace. 
See also -------- reorder_categories add_categories remove_categories remove_unused_categories set_categories """ cat = self if inplace else self.copy() cat.categories = new_categories if not inplace: return cat def reorder_categories(self, new_categories, ordered=None, inplace=False): """ Reorders categories as specified in new_categories. `new_categories` need to include all old categories and no new category items. Raises ------ ValueError If the new categories do not contain all old category items or any new ones Parameters ---------- new_categories : Index-like The categories in new order. ordered : boolean, optional Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. inplace : boolean (default: False) Whether or not to reorder the categories inplace or return a copy of this categorical with reordered categories. Returns ------- cat : Categorical with reordered categories or None if inplace. See also -------- rename_categories add_categories remove_categories remove_unused_categories set_categories """ if set(self._categories) != set(new_categories): raise ValueError("items in new_categories are not the same as in " "old categories") return self.set_categories(new_categories, ordered=ordered, inplace=inplace) def add_categories(self, new_categories, inplace=False): """ Add new categories. `new_categories` will be included at the last/highest place in the categories and will be unused directly after this call. Raises ------ ValueError If the new categories include old categories or do not validate as categories Parameters ---------- new_categories : category or list-like of category The new categories to be included. inplace : boolean (default: False) Whether or not to add the categories inplace or return a copy of this categorical with added categories. Returns ------- cat : Categorical with new categories added or None if inplace. 
See also -------- rename_categories reorder_categories remove_categories remove_unused_categories set_categories """ if not is_list_like(new_categories): new_categories = [new_categories] already_included = set(new_categories) & set(self._categories) if len(already_included) != 0: msg = ("new categories must not include old categories: %s" % str(already_included)) raise ValueError(msg) new_categories = list(self._categories) + list(new_categories) cat = self if inplace else self.copy() cat._categories = self._validate_categories(new_categories) cat._codes = _coerce_indexer_dtype(cat._codes, new_categories) if not inplace: return cat def remove_categories(self, removals, inplace=False): """ Removes the specified categories. `removals` must be included in the old categories. Values which were in the removed categories will be set to NaN Raises ------ ValueError If the removals are not contained in the categories Parameters ---------- removals : category or list of categories The categories which should be removed. inplace : boolean (default: False) Whether or not to remove the categories inplace or return a copy of this categorical with removed categories. Returns ------- cat : Categorical with removed categories or None if inplace. 
See also -------- rename_categories reorder_categories add_categories remove_unused_categories set_categories """ if not is_list_like(removals): removals = [removals] removal_set = set(list(removals)) not_included = removal_set - set(self._categories) new_categories = [c for c in self._categories if c not in removal_set] # GH 10156 if any(isnull(removals)): not_included = [x for x in not_included if notnull(x)] new_categories = [x for x in new_categories if notnull(x)] if len(not_included) != 0: raise ValueError("removals must all be in old categories: %s" % str(not_included)) return self.set_categories(new_categories, ordered=self.ordered, rename=False, inplace=inplace) def remove_unused_categories(self, inplace=False): """ Removes categories which are not used. Parameters ---------- inplace : boolean (default: False) Whether or not to drop unused categories inplace or return a copy of this categorical with unused categories dropped. Returns ------- cat : Categorical with unused categories dropped or None if inplace. See also -------- rename_categories reorder_categories add_categories remove_categories set_categories """ cat = self if inplace else self.copy() idx, inv = np.unique(cat._codes, return_inverse=True) if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv = idx[1:], inv - 1 cat._categories = cat.categories.take(idx) cat._codes = _coerce_indexer_dtype(inv, self._categories) if not inplace: return cat def map(self, mapper): """ Apply mapper function to its categories (not codes). Parameters ---------- mapper : callable Function to be applied. When all categories are mapped to different categories, the result will be Categorical which has the same order property as the original. Otherwise, the result will be np.ndarray. Returns ------- applied : Categorical or np.ndarray. 
""" new_categories = self.categories.map(mapper) try: return Categorical.from_codes(self._codes.copy(), categories=new_categories, ordered=self.ordered) except ValueError: return np.take(new_categories, self._codes) __eq__ = _cat_compare_op('__eq__') __ne__ = _cat_compare_op('__ne__') __lt__ = _cat_compare_op('__lt__') __gt__ = _cat_compare_op('__gt__') __le__ = _cat_compare_op('__le__') __ge__ = _cat_compare_op('__ge__') # for Series/ndarray like compat @property def shape(self): """ Shape of the Categorical. For internal compatibility with numpy arrays. Returns ------- shape : tuple """ return tuple([len(self._codes)]) def shift(self, periods): """ Shift Categorical by desired number of periods. Parameters ---------- periods : int Number of periods to move, can be positive or negative Returns ------- shifted : Categorical """ # since categoricals always have ndim == 1, an axis parameter # doesnt make any sense here. codes = self.codes if codes.ndim > 1: raise NotImplementedError("Categorical with ndim > 1.") if np.prod(codes.shape) and (periods != 0): codes = np.roll(codes, _ensure_platform_int(periods), axis=0) if periods > 0: codes[:periods] = -1 else: codes[periods:] = -1 return Categorical.from_codes(codes, categories=self.categories, ordered=self.ordered)
[docs] def __array__(self, dtype=None): """ The numpy array interface. Returns ------- values : numpy array A numpy array of either the specified dtype or, if dtype==None (default), the same dtype as categorical.categories.dtype """ ret = take_1d(self.categories.values, self._codes) if dtype and not is_dtype_equal(dtype, self.categories.dtype): return np.asarray(ret, dtype) return ret
def __setstate__(self, state):
    """
    Restore the object from its pickled state.

    Necessary for making this object picklable. State written with older
    attribute names is renamed before the attributes are set.

    Parameters
    ----------
    state : dict
        Mapping of attribute name to value.

    Raises
    ------
    Exception
        If `state` is not a dict.
    """
    if not isinstance(state, dict):
        raise Exception('invalid pickle state')

    # Provide compatibility with pre-0.15.0 Categoricals.
    if '_categories' not in state and '_levels' in state:
        state['_categories'] = self._validate_categories(
            state.pop('_levels'))
    if '_codes' not in state and 'labels' in state:
        state['_codes'] = _coerce_indexer_dtype(state.pop('labels'),
                                                state['_categories'])

    # 0.16.0 ordered change
    if '_ordered' not in state:

        # >=15.0 < 0.16.0
        if 'ordered' in state:
            state['_ordered'] = state.pop('ordered')
        else:
            state['_ordered'] = False

    for k, v in compat.iteritems(state):
        setattr(self, k, v)


@property
def T(self):
    """Return `self`; a Categorical is its own transpose."""
    return self


@property
def nbytes(self):
    """Number of bytes used by the codes plus the category values."""
    return self._codes.nbytes + self._categories.values.nbytes


def memory_usage(self, deep=False):
    """
    Memory usage of my values

    Parameters
    ----------
    deep : bool
        Introspect the data deeply, interrogate
        `object` dtypes for system-level memory consumption

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False

    See Also
    --------
    numpy.ndarray.nbytes
    """
    return self._codes.nbytes + self._categories.memory_usage(deep=deep)


@Substitution(klass='Categorical', value='v')
@Appender(_shared_docs['searchsorted'])
def searchsorted(self, v, side='left', sorter=None):
    # No docstring here: it is presumably supplied by the Appender
    # decorator from the shared docs -- confirm before adding one.
    if not self.ordered:
        raise ValueError("Categorical not ordered\nyou can use "
                         ".as_ordered() to change the Categorical to an "
                         "ordered one")

    from pandas.core.series import Series

    # Translate the values to codes through the categories themselves, so
    # that the order of the categories (and not the sort order of the
    # category values) decides the insertion point.
    values_as_codes = _get_codes_for_values(Series(v).values,
                                            self.categories)

    # A value that is not a category has no position in the ordering.
    if -1 in values_as_codes:
        raise ValueError("Value(s) to be inserted must be in categories.")

    return self.codes.searchsorted(values_as_codes, side=side,
                                   sorter=sorter)


def isnull(self):
    """
    Detect missing values

    Both missing values (-1 in .codes) and NA as a category are detected.

    Returns
    -------
    a boolean array of whether my values are null

    See also
    --------
    pandas.isnull : pandas version
    Categorical.notnull : boolean inverse of Categorical.isnull
    """
    ret = self._codes == -1

    # String/object and float categories can hold np.nan
    if self.categories.dtype.kind in ['S', 'O', 'f']:
        if np.nan in self.categories:
            nan_pos = np.where(isnull(self.categories))[0]
            # we only have one NA in categories
            ret = np.logical_or(ret, self._codes == nan_pos)
    return ret


def notnull(self):
    """
    Reverse of isnull

    Both missing values (-1 in .codes) and NA as a category are detected
    as null.

    Returns
    -------
    a boolean array of whether my values are not null

    See also
    --------
    pandas.notnull : pandas version
    Categorical.isnull : boolean inverse of Categorical.notnull
    """
    return ~self.isnull()


def put(self, *args, **kwargs):
    """
    Replace specific elements in the Categorical with given values.

    Raises
    ------
    NotImplementedError
        Always; the operation is not implemented for Categorical.
    """
    raise NotImplementedError(("'put' is not yet implemented "
                               "for Categorical"))


def dropna(self):
    """
    Return the Categorical without null values.

    Both missing values (-1 in .codes) and NA as a category are detected.
    NA is removed from the categories if present.

    Returns
    -------
    valid : Categorical
    """
    result = self[self.notnull()]
    if isnull(result.categories).any():
        result = result.remove_categories([np.nan])
    return result


def value_counts(self, dropna=True):
    """
    Returns a Series containing counts of each category.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : boolean, default True
        Don't include counts of NaN, even if NaN is a category.

    Returns
    -------
    counts : Series
    """
    from numpy import bincount
    from pandas.types.missing import isnull
    from pandas.core.series import Series
    from pandas.core.index import CategoricalIndex

    obj = (self.remove_categories([np.nan]) if dropna and
           isnull(self.categories).any() else self)
    code, cat = obj._codes, obj.categories
    ncat, mask = len(cat), 0 <= code
    ix, clean = np.arange(ncat), mask.all()

    if dropna or clean:
        obs = code if clean else code[mask]
        count = bincount(obs, minlength=ncat or None)
    else:
        # count the missing values (-1) in an extra trailing bin
        count = bincount(np.where(mask, code, ncat))
        ix = np.append(ix, -1)

    ix = Categorical(ix, categories=cat, ordered=obj.ordered,
                     fastpath=True)

    return Series(count, index=CategoricalIndex(ix), dtype='int64')


def get_values(self):
    """
    Return the values.

    For internal compatibility with pandas formatting.

    Returns
    -------
    values : numpy array
        A numpy array of the same dtype as categorical.categories.dtype or
        Index if datetime / periods
    """
    # if we are a datetime and period index, return Index to keep metadata
    if is_datetimelike(self.categories):
        return self.categories.take(self._codes, fill_value=np.nan)
    return np.array(self)


def check_for_ordered(self, op):
    """
    Assert that we are ordered.

    Parameters
    ----------
    op : str
        Name of the operation, used in the error message.

    Raises
    ------
    TypeError
        If the Categorical is not ordered.
    """
    if not self.ordered:
        raise TypeError("Categorical is not ordered for operation {op}\n"
                        "you can use .as_ordered() to change the "
                        "Categorical to an ordered one\n".format(op=op))


def argsort(self, ascending=True, *args, **kwargs):
    """
    Returns the indices that would sort the Categorical instance if
    'sort_values' was called. This function is implemented to provide
    compatibility with numpy ndarray objects.

    While an ordering is applied to the category values, arg-sorting
    in this context refers more to organizing and grouping together
    based on matching category values. Thus, this function can be
    called on an unordered Categorical instance unlike the functions
    'Categorical.min' and 'Categorical.max'.

    Returns
    -------
    argsorted : numpy array

    See also
    --------
    numpy.ndarray.argsort
    """
    ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
    result = np.argsort(self._codes.copy(), **kwargs)
    if not ascending:
        result = result[::-1]
    return result


def sort_values(self, inplace=False, ascending=True, na_position='last'):
    """
    Sorts the Categorical by category value returning a new
    Categorical by default.

    While an ordering is applied to the category values, sorting in this
    context refers more to organizing and grouping together based on
    matching category values. Thus, this function can be called on an
    unordered Categorical instance unlike the functions 'Categorical.min'
    and 'Categorical.max'.

    Parameters
    ----------
    inplace : boolean, default False
        Do operation in place.
    ascending : boolean, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    y : Categorical or None

    Raises
    ------
    ValueError
        If `na_position` is neither 'first' nor 'last'.

    See Also
    --------
    Categorical.sort

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c
    [1, 2, 2, 1, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    Inplace sorting can be done as well:

    >>> c.sort_values(inplace=True)
    >>> c
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c
    [NaN, 2.0, 2.0, NaN, 5.0]
    Categories (2, int64): [2, 5]
    >>> c.sort_values()
    [2.0, 2.0, 5.0, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False)
    [5.0, 2.0, 2.0, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(na_position='first')
    [NaN, NaN, 2.0, 2.0, 5.0]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5.0, 2.0, 2.0]
    Categories (2, int64): [2, 5]
    """
    if na_position not in ['last', 'first']:
        raise ValueError('invalid na_position: {!r}'.format(na_position))

    codes = np.sort(self._codes)
    if not ascending:
        codes = codes[::-1]

    # NaN handling
    na_mask = (codes == -1)
    if na_mask.any():
        n_nans = len(codes[na_mask])
        if na_position == "first":
            # in this case sort to the front
            new_codes = codes.copy()
            new_codes[0:n_nans] = -1
            new_codes[n_nans:] = codes[~na_mask]
            codes = new_codes
        elif na_position == "last":
            # ... and to the end
            new_codes = codes.copy()
            pos = len(codes) - n_nans
            new_codes[0:pos] = codes[~na_mask]
            new_codes[pos:] = -1
            codes = new_codes
    if inplace:
        self._codes = codes
        return
    else:
        return Categorical(values=codes, categories=self.categories,
                           ordered=self.ordered, fastpath=True)


def order(self, inplace=False, ascending=True, na_position='last'):
    """
    DEPRECATED: use :meth:`Categorical.sort_values`. That function
    is entirely equivalent to this one.

    See Also
    --------
    Categorical.sort_values
    """
    warn("order is deprecated, use sort_values(...)", FutureWarning,
         stacklevel=2)
    return self.sort_values(inplace=inplace, ascending=ascending,
                            na_position=na_position)


def sort(self, inplace=True, ascending=True, na_position='last', **kwargs):
    """
    DEPRECATED: use :meth:`Categorical.sort_values`. That function
    is just like this one, except that a new Categorical is returned
    by default, so make sure to pass in 'inplace=True' to get
    inplace sorting.

    See Also
    --------
    Categorical.sort_values
    """
    warn("sort is deprecated, use sort_values(...)", FutureWarning,
         stacklevel=2)
    nv.validate_sort(tuple(), kwargs)
    return self.sort_values(inplace=inplace, ascending=ascending,
                            na_position=na_position)


def ravel(self, order='C'):
    """
    Return a flattened (numpy) array.

    For internal compatibility with numpy arrays.

    Returns
    -------
    raveled : numpy array
    """
    return np.array(self)


def view(self):
    """
    Return a view of myself.

    For internal compatibility with numpy arrays.

    Returns
    -------
    view : Categorical
       Returns `self`!
    """
    return self


def to_dense(self):
    """
    Return my 'dense' representation

    For internal compatibility with numpy arrays.

    Returns
    -------
    dense : array
    """
    return np.asarray(self)


@deprecate_kwarg(old_arg_name='fill_value', new_arg_name='value')
def fillna(self, value=None, method=None, limit=None):
    """
    Fill NA/NaN values using the specified method.

    Parameters
    ----------
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    value : scalar
        Value to use to fill holes (e.g. 0)
    limit : int, default None
        (Not implemented yet for Categorical!)
        If method is specified, this is the maximum number of consecutive
        NaN values to forward/backward fill. In other words, if there is
        a gap with more than this number of consecutive NaNs, it will only
        be partially filled. If method is not specified, this is the
        maximum number of entries along the entire axis where NaNs will be
        filled.

    Returns
    -------
    filled : Categorical with NA/NaN filled

    Raises
    ------
    NotImplementedError
        If `limit` is given.
    ValueError
        If no `method` is given and `value` is neither null nor one of
        the categories.
    """
    if value is None:
        value = np.nan
    if limit is not None:
        raise NotImplementedError("specifying a limit for fillna has not "
                                  "been implemented yet")

    values = self._codes

    # Make sure that we also get NA in categories
    if self.categories.dtype.kind in ['S', 'O', 'f']:
        if np.nan in self.categories:
            values = values.copy()
            nan_pos = np.where(isnull(self.categories))[0]
            # we only have one NA in categories
            values[values == nan_pos] = -1

    # pad / bfill
    if method is not None:

        values = self.to_dense().reshape(-1, len(self))
        values = interpolate_2d(values, method, 0, None,
                                value).astype(self.categories.dtype)[0]
        values = _get_codes_for_values(values, self.categories)

    else:

        if not isnull(value) and value not in self.categories:
            raise ValueError("fill value must be in categories")

        mask = values == -1
        if mask.any():
            values = values.copy()
            values[mask] = self.categories.get_loc(value)

    return Categorical(values, categories=self.categories,
                       ordered=self.ordered, fastpath=True)


def take_nd(self, indexer, allow_fill=True, fill_value=None):
    """
    Take the codes by the indexer, fill with the fill_value.

    For internal compatibility with numpy arrays.

    NOTE: `allow_fill` is accepted but not forwarded; the codes are always
    taken with ``allow_fill=True`` and a fill code of -1.
    """
    # filling must always be None/nan here
    # but is passed thru internally
    assert isnull(fill_value)

    codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
    result = Categorical(codes, categories=self.categories,
                         ordered=self.ordered, fastpath=True)
    return result


take = take_nd


def _slice(self, slicer):
    """
    Return a slice of myself.

    For internal compatibility with numpy arrays.
    """
    # only allow 1 dimensional slicing, but can
    # in a 2-d case be passed (slice(None),....)
    if isinstance(slicer, tuple) and len(slicer) == 2:
        if not is_null_slice(slicer[0]):
            raise AssertionError("invalid slicing for a 1-ndim "
                                 "categorical")
        slicer = slicer[1]

    _codes = self._codes[slicer]
    return Categorical(values=_codes, categories=self.categories,
                       ordered=self.ordered, fastpath=True)


def __len__(self):
    """The length of this Categorical."""
    return len(self._codes)


def __iter__(self):
    """Returns an Iterator over the values of this Categorical."""
    return iter(self.get_values())


def _tidy_repr(self, max_vals=10, footer=True):
    """
    A short repr displaying only max_vals and an optional (but default)
    footer.
    """
    num = max_vals // 2
    head = self[:num]._get_repr(length=False, footer=False)
    tail = self[-(max_vals - num):]._get_repr(length=False, footer=False)

    # drop the last character of head and the first of tail before joining
    result = '%s, ..., %s' % (head[:-1], tail[1:])
    if footer:
        result = '%s\n%s' % (result, self._repr_footer())

    return compat.text_type(result)


def _repr_categories(self):
    """Return the base repr for the categories."""
    max_categories = (10 if get_option("display.max_categories") == 0 else
                      get_option("display.max_categories"))
    from pandas.formats import format as fmt
    if len(self.categories) > max_categories:
        num = max_categories // 2
        head = fmt.format_array(self.categories[:num], None)
        tail = fmt.format_array(self.categories[-num:], None)
        category_strs = head + ["..."] + tail
    else:
        category_strs = fmt.format_array(self.categories, None)

    # Strip all leading spaces, which format_array adds for columns...
    category_strs = [x.strip() for x in category_strs]
    return category_strs


def _repr_categories_info(self):
    """Returns a string representation of the footer."""
    category_strs = self._repr_categories()
    dtype = getattr(self.categories, 'dtype_str',
                    str(self.categories.dtype))

    levheader = "Categories (%d, %s): " % (len(self.categories), dtype)
    width, height = get_terminal_size()
    max_width = get_option("display.width") or width
    if com.in_ipython_frontend():
        # 0 = no breaks
        max_width = 0
    levstring = ""
    start = True
    cur_col_len = len(levheader)  # header
    sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
    linesep = sep.rstrip() + "\n"  # remove whitespace
    for val in category_strs:
        if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
            levstring += linesep + (" " * (len(levheader) + 1))
            cur_col_len = len(levheader) + 1  # header + a whitespace
        elif not start:
            levstring += sep
            cur_col_len += len(val)
        levstring += val
        start = False
    # shorten the elided part to save space
    return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]"


def _repr_footer(self):
    """Return the length line followed by the categories info."""
    return u('Length: %d\n%s') % (len(self), self._repr_categories_info())


def _get_repr(self, length=True, na_rep='NaN', footer=True):
    """Return the text produced by CategoricalFormatter for `self`."""
    from pandas.formats import format as fmt
    formatter = fmt.CategoricalFormatter(self, length=length,
                                         na_rep=na_rep, footer=footer)
    result = formatter.to_string()
    return compat.text_type(result)


def __unicode__(self):
    """Unicode representation."""
    _maxlen = 10
    if len(self._codes) > _maxlen:
        result = self._tidy_repr(_maxlen)
    elif len(self._codes) > 0:
        result = self._get_repr(length=len(self) > _maxlen)
    else:
        result = ('[], %s' %
                  self._get_repr(length=False,
                                 footer=True, ).replace("\n", ", "))

    return result


def _maybe_coerce_indexer(self, indexer):
    """Return an indexer coerced to the codes dtype."""
    if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i':
        indexer = indexer.astype(self._codes.dtype)
    return indexer


def __getitem__(self, key):
    """
    Return an item.

    An integer key gives the category value at that position (np.nan for
    the missing code -1); any other key gives a Categorical over the
    selected codes.
    """
    if isinstance(key, (int, np.integer)):
        i = self._codes[key]
        if i == -1:
            return np.nan
        else:
            return self.categories[i]
    else:
        return Categorical(values=self._codes[key],
                           categories=self.categories,
                           ordered=self.ordered, fastpath=True)


def __setitem__(self, key, value):
    """
    Item assignment.

    Raises
    ------
    ValueError
        If (one or more) Value is not in categories or if a assigned
        `Categorical` does not have the same categories
    """
    # require identical categories set
    if isinstance(value, Categorical):
        if not value.categories.equals(self.categories):
            raise ValueError("Cannot set a Categorical with another, "
                             "without identical categories")

    rvalue = value if is_list_like(value) else [value]

    from pandas import Index
    to_add = Index(rvalue).difference(self.categories)

    # no assignments of values not in categories, but it's always ok to set
    # something to np.nan
    if len(to_add) and not isnull(to_add).all():
        raise ValueError("Cannot setitem on a Categorical with a new "
                         "category, set the categories first")

    # set by position
    if isinstance(key, (int, np.integer)):
        pass

    # tuple of indexers (dataframe)
    elif isinstance(key, tuple):
        # only allow 1 dimensional slicing, but can
        # in a 2-d case be passed (slice(None),....)
        if len(key) == 2:
            if not is_null_slice(key[0]):
                raise AssertionError("invalid slicing for a 1-ndim "
                                     "categorical")
            key = key[1]
        elif len(key) == 1:
            key = key[0]
        else:
            raise AssertionError("invalid slicing for a 1-ndim "
                                 "categorical")

    # slicing in Series or Categorical
    elif isinstance(key, slice):
        pass

    # Array of True/False in Series or Categorical
    else:
        # There is a bug in numpy, which does not accept a Series as a
        # indexer
        # https://github.com/pydata/pandas/issues/6168
        # https://github.com/numpy/numpy/issues/4240 -> fixed in numpy 1.9
        # FIXME: remove when numpy 1.9 is the lowest numpy version pandas
        # accepts...
        key = np.asarray(key)

    lindexer = self.categories.get_indexer(rvalue)

    # FIXME: the following can be removed after GH7820 is fixed:
    # https://github.com/pydata/pandas/issues/7820
    # float categories do currently return -1 for np.nan, even if np.nan is
    # included in the index -> "repair" this here
    if isnull(rvalue).any() and isnull(self.categories).any():
        nan_pos = np.where(isnull(self.categories))[0]
        lindexer[lindexer == -1] = nan_pos

    lindexer = self._maybe_coerce_indexer(lindexer)
    self._codes[key] = lindexer


# reduction ops #
def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
            filter_type=None, **kwds):
    """
    Perform the reduction type operation.

    The method named `name` is looked up on `self` and called with
    `numeric_only` and `kwds`.

    Raises
    ------
    TypeError
        If `self` has no attribute called `name`.
    """
    func = getattr(self, name, None)
    if func is None:
        raise TypeError("Categorical cannot perform the operation "
                        "{op}".format(op=name))
    return func(numeric_only=numeric_only, **kwds)


def min(self, numeric_only=None, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    min : the minimum of this `Categorical`
    """
    self.check_for_ordered('min')
    if numeric_only:
        # ignore missing values (-1 codes)
        good = self._codes != -1
        pointer = self._codes[good].min(**kwargs)
    else:
        pointer = self._codes.min(**kwargs)
    if pointer == -1:
        return np.nan
    else:
        return self.categories[pointer]


def max(self, numeric_only=None, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    max : the maximum of this `Categorical`
    """
    self.check_for_ordered('max')
    if numeric_only:
        # ignore missing values (-1 codes)
        good = self._codes != -1
        pointer = self._codes[good].max(**kwargs)
    else:
        pointer = self._codes.max(**kwargs)
    if pointer == -1:
        return np.nan
    else:
        return self.categories[pointer]


def mode(self):
    """
    Returns the mode(s) of the Categorical.

    Empty if nothing occurs at least 2 times. Always returns `Categorical`
    even if only one value.

    Returns
    -------
    modes : `Categorical` (sorted)
    """
    import pandas.hashtable as htable
    good = self._codes != -1
    values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
    result = Categorical(values=values, categories=self.categories,
                         ordered=self.ordered, fastpath=True)
    return result


def unique(self):
    """
    Return the ``Categorical`` which ``categories`` and ``codes`` are
    unique. Unused categories are NOT returned.

    - unordered category: values and categories are sorted by appearance
      order.
    - ordered category: values are sorted by appearance order, categories
      keeps existing order.

    Returns
    -------
    unique values : ``Categorical``
    """
    from pandas.core.nanops import unique1d

    # unlike np.unique, unique1d does not sort
    unique_codes = unique1d(self.codes)
    cat = self.copy()
    # keep nan in codes
    cat._codes = unique_codes
    # exclude nan from indexer for categories
    take_codes = unique_codes[unique_codes != -1]
    if self.ordered:
        take_codes = sorted(take_codes)
    return cat.set_categories(cat.categories.take(take_codes))


def equals(self, other):
    """
    Returns True if categorical arrays are equal.

    Parameters
    ----------
    other : `Categorical`

    Returns
    -------
    are_equal : boolean
    """
    return (self.is_dtype_equal(other) and
            np.array_equal(self._codes, other._codes))


def is_dtype_equal(self, other):
    """
    Returns True if categoricals are the same dtype
      same categories, and same ordered

    Parameters
    ----------
    other : Categorical

    Returns
    -------
    are_equal : boolean
    """
    try:
        return (self.categories.equals(other.categories) and
                self.ordered == other.ordered)
    except (AttributeError, TypeError):
        return False


def describe(self):
    """
    Describes this Categorical

    Returns
    -------
    description: `DataFrame`
        A dataframe with frequency and counts by category.
    """
    counts = self.value_counts(dropna=False)
    freqs = counts / float(counts.sum())

    from pandas.tools.merge import concat
    result = concat([counts, freqs], axis=1)
    result.columns = ['counts', 'freqs']
    result.index.name = 'categories'

    return result


def repeat(self, repeats, *args, **kwargs):
    """
    Repeat elements of a Categorical.

    See also
    --------
    numpy.ndarray.repeat
    """
    nv.validate_repeat(args, kwargs)
    codes = self._codes.repeat(repeats)
    return Categorical(values=codes, categories=self.categories,
                       ordered=self.ordered, fastpath=True)
# The Series.cat accessor


class CategoricalAccessor(PandasDelegate, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Be aware that assigning to `categories` is a inplace operation, while all
    methods return new categorical data per default (but can be called with
    `inplace=True`).

    Examples
    --------
    >>> s.cat.categories
    >>> s.cat.categories = list('abc')
    >>> s.cat.rename_categories(list('cab'))
    >>> s.cat.reorder_categories(list('cab'))
    >>> s.cat.add_categories(['d','e'])
    >>> s.cat.remove_categories(['d'])
    >>> s.cat.remove_unused_categories()
    >>> s.cat.set_categories(list('abcde'))
    >>> s.cat.as_ordered()
    >>> s.cat.as_unordered()
    """

    def __init__(self, values, index):
        self.categorical = values
        self.index = index
        self._freeze()

    def _delegate_property_get(self, name):
        """Read attribute `name` from the wrapped categorical."""
        return getattr(self.categorical, name)

    def _delegate_property_set(self, name, new_values):
        """Set attribute `name` on the wrapped categorical."""
        return setattr(self.categorical, name, new_values)

    @property
    def codes(self):
        """The codes of the wrapped categorical as a Series on `index`."""
        from pandas import Series
        return Series(self.categorical.codes, index=self.index)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Call method `name` of the wrapped categorical.

        A non-None result is wrapped in a Series on `index`; a None result
        is passed through as None.
        """
        from pandas import Series
        outcome = getattr(self.categorical, name)(*args, **kwargs)
        if outcome is None:
            return None
        return Series(outcome, index=self.index)


CategoricalAccessor._add_delegate_accessors(
    delegate=Categorical,
    accessors=["categories", "ordered"],
    typ='property')
CategoricalAccessor._add_delegate_accessors(
    delegate=Categorical,
    accessors=["rename_categories", "reorder_categories",
               "add_categories", "remove_categories",
               "remove_unused_categories", "set_categories",
               "as_ordered", "as_unordered"],
    typ='method')

# utility routines


def _get_codes_for_values(values, categories):
    """
    utility routine to turn values into codes given the specified categories
    """
    from pandas.core.algorithms import _get_data_algo, _hashtables

    # values and categories must share a dtype; fall back to object for both
    if not is_dtype_equal(values.dtype, categories.dtype):
        values = _ensure_object(values)
        categories = _ensure_object(categories)

    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
    (_, _), cats = _get_data_algo(categories, _hashtables)

    table = hash_klass(len(cats))
    table.map_locations(cats)
    located = table.lookup(vals)
    return _coerce_indexer_dtype(located, cats)


def _convert_to_list_like(list_like):
    """
    Return `list_like` unchanged when it has a dtype or is a list; turn any
    other sequence, tuple or generator into a list; wrap anything else in a
    one-element list.
    """
    if hasattr(list_like, "dtype") or isinstance(list_like, list):
        return list_like

    if (is_sequence(list_like) or
            isinstance(list_like, (tuple, types.GeneratorType))):
        return list(list_like)

    if is_scalar(list_like):
        return [list_like]

    # is this reached?
    return [list_like]