Source code for pandas.core.algorithms

"""
Generic data algorithms. This module is experimental at the moment and not
intended for public consumption
"""
from __future__ import division
from warnings import warn
import numpy as np

from pandas import compat, lib, tslib, _np_version_under1p8
from pandas.types.cast import _maybe_promote
from pandas.types.generic import (ABCSeries, ABCIndex, ABCPeriodIndex,
                                  ABCDatetimeIndex)
from pandas.types.common import (is_integer_dtype,
                                 is_int64_dtype,
                                 is_categorical_dtype,
                                 is_extension_type,
                                 is_datetimetz,
                                 is_period_arraylike,
                                 is_datetime_or_timedelta_dtype,
                                 is_float_dtype,
                                 needs_i8_conversion,
                                 is_categorical,
                                 is_datetime64_dtype,
                                 is_timedelta64_dtype,
                                 is_scalar,
                                 _ensure_platform_int,
                                 _ensure_object,
                                 _ensure_float64,
                                 _ensure_int64,
                                 is_list_like)
from pandas.types.missing import isnull

import pandas.core.common as com
import pandas.algos as algos
import pandas.hashtable as htable
from pandas.compat import string_types
from pandas.tslib import iNaT


# --------------- #
# top-level algos #
# --------------- #

def match(to_match, values, na_sentinel=-1):
    """
    Compute the locations of the elements of ``to_match`` within ``values``

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    match : ndarray of integers
    """
    values = com._asarray_tuplesafe(values)
    if issubclass(values.dtype.type, string_types):
        values = np.array(values, dtype='O')

    f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
    result = _hashtable_algo(f, values.dtype, np.int64)

    if na_sentinel != -1:

        # replace but return a numpy array
        # use a Series because it handles dtype conversions properly
        from pandas.core.series import Series
        result = Series(result.ravel()).replace(-1, na_sentinel).values.\
            reshape(result.shape)

    return result


def _match_generic(values, index, table_type, type_caster):
    values = type_caster(values)
    index = type_caster(index)
    table = table_type(min(len(index), 1000000))
    table.map_locations(index)
    return table.lookup(values)
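

# Example (illustrative sketch, not from the upstream source; reprs assume a
# 64-bit platform): ``match`` returns, for each element of ``to_match``, its
# position in ``values``, with ``na_sentinel`` marking misses.
#
#   >>> match([0, 2, 1, 9], [0, 1, 2])
#   array([ 0,  2,  1, -1])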


def unique(values):
    """
    Compute the unique values (not necessarily sorted) of an input array of
    values, efficiently via a hash table

    Parameters
    ----------
    values : array-like

    Returns
    -------
    uniques : ndarray
    """
    values = com._asarray_tuplesafe(values)

    f = lambda htype, caster: _unique_generic(values, htype, caster)
    return _hashtable_algo(f, values.dtype)


def _unique_generic(values, table_type, type_caster):
    values = type_caster(values)
    table = table_type(min(len(values), 1000000))
    uniques = table.unique(values)
    return type_caster(uniques)
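

# Example (illustrative sketch, not from the upstream source): being
# hash-table based, ``unique`` typically returns values in order of first
# appearance rather than sorted order.
#
#   >>> unique(np.array([2, 1, 2, 3]))
#   array([2, 1, 3])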


def isin(comps, values):
    """
    Compute the isin boolean array

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    boolean ndarray of the same length as ``comps``
    """

    if not is_list_like(comps):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a "
                        "[{0}]".format(type(comps).__name__))
    comps = np.asarray(comps)
    if not is_list_like(values):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a "
                        "[{0}]".format(type(values).__name__))
    if not isinstance(values, np.ndarray):
        values = list(values)

    # GH11232
    # work-around for numpy < 1.8 and comparisons on py3
    # faster for larger cases to use np.in1d
    if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
        f = lambda x, y: np.in1d(x, np.asarray(list(y)))
    else:
        f = lambda x, y: lib.ismember_int64(x, set(y))

    # may need i8 conversion for proper membership testing
    if is_datetime64_dtype(comps):
        from pandas.tseries.tools import to_datetime
        values = to_datetime(values)._values.view('i8')
        comps = comps.view('i8')
    elif is_timedelta64_dtype(comps):
        from pandas.tseries.timedeltas import to_timedelta
        values = to_timedelta(values)._values.view('i8')
        comps = comps.view('i8')
    elif is_int64_dtype(comps):
        pass
    else:
        f = lambda x, y: lib.ismember(x, set(values))

    return f(comps, values)
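

# Example (illustrative sketch, not from the upstream source; bool repr
# follows older numpy): an element-wise membership test.
#
#   >>> isin(np.array([1, 2, 3]), [2, 4])
#   array([False,  True, False], dtype=bool)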


def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list-like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError("Only list-like objects are allowed to be passed to"
                        "safe_sort as values")
    values = np.array(values, copy=False)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return _ensure_object(np.concatenate([nums, strs]))

    sorter = None
    if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError("Only list-like objects or None are allowed to be"
                        "passed to safe_sort as labels")
    labels = _ensure_platform_int(np.asarray(labels))

    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types
        (hash_klass, _), values = _get_data_algo(values, _hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = _ensure_platform_int(t.lookup(ordered))

    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = (labels < -len(values)) | (labels >= len(values)) | \
        (labels == na_sentinel)

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, new_labels
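

# Example (illustrative sketch, not from the upstream source): sorting values
# while remapping labels that index into them; the -1 entry already matches
# ``na_sentinel`` and is passed through.
#
#   >>> safe_sort(np.array([3, 1, 2]), labels=[0, 1, -1, 2])
#   (array([1, 2, 3]), array([ 2,  0, -1,  1]))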


def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values
        is Index or Series

    Note: an array of Periods will ignore sort as it returns an always-sorted
    PeriodIndex.
    """
    from pandas import Index, Series, DatetimeIndex

    vals = np.asarray(values)

    # localize to UTC
    is_datetimetz_type = is_datetimetz(values)
    if is_datetimetz_type:
        values = DatetimeIndex(values)
        vals = values.asi8

    is_datetime = is_datetime64_dtype(vals)
    is_timedelta = is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = _ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                    assume_unique=True)

    if is_datetimetz_type:
        # reset tz
        uniques = values._shallow_copy(uniques)
    elif is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')

    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
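

# Example (illustrative sketch, not from the upstream source): factorize
# encodes values as integer labels plus an array of uniques; with sort=True
# the uniques are ordered and the labels remapped accordingly.
#
#   >>> factorize(np.array(['b', 'a', 'b', 'c'], dtype=object), sort=True)
#   (array([1, 0, 1, 2]), array(['a', 'b', 'c'], dtype=object))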


def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    name = getattr(values, 'name', None)

    if bins is not None:
        try:
            from pandas.tools.tile import cut
            values = Series(values).values
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if is_extension_type(values) and not is_datetimetz(values):
        # handle Categorical and sparse,
        # datetime tz can be handled in ndarray path
        result = Series(values).values.value_counts(dropna=dropna)
        result.name = name
        counts = result.values
    else:
        # ndarray path. pass original to handle DatetimeTzBlock
        keys, counts = _value_counts_arraylike(values, dropna=dropna)

        from pandas import Index, Series
        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.categories)),
                                fill_value=0)
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(counts.sum())

    return result


def _value_counts_arraylike(values, dropna=True):
    is_datetimetz_type = is_datetimetz(values)
    is_period = (isinstance(values, ABCPeriodIndex) or
                 is_period_arraylike(values))

    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if is_datetime_or_timedelta_dtype(dtype) or is_period:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period:
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz_type:
            if isinstance(orig, ABCDatetimeIndex):
                tz = orig.tz
            else:
                tz = orig.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if is_period:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_int64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_float64(values, dropna)
    else:
        values = _ensure_object(values)
        mask = isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.NaN)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
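

# Example (illustrative sketch, not from the upstream source): counts come
# back as a Series, sorted by count in descending order; NaN is dropped by
# default.
#
#   >>> value_counts(np.array([1.0, 1.0, 2.0, np.nan]))
#   1.0    2
#   2.0    1
#   dtype: int64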


def duplicated(values, keep='first'):
    """
    Return boolean ndarray denoting duplicate values

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : array-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """

    dtype = values.dtype

    # no need to revert to original type
    if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype):
        if isinstance(values, (ABCSeries, ABCIndex)):
            values = values.values.view(np.int64)
        else:
            values = values.view(np.int64)
    elif is_period_arraylike(values):
        from pandas.tseries.period import PeriodIndex
        values = PeriodIndex(values).asi8
    elif is_categorical_dtype(dtype):
        values = values.values.codes
    elif isinstance(values, (ABCSeries, ABCIndex)):
        values = values.values

    if is_integer_dtype(dtype):
        values = _ensure_int64(values)
        duplicated = htable.duplicated_int64(values, keep=keep)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        duplicated = htable.duplicated_float64(values, keep=keep)
    else:
        values = _ensure_object(values)
        duplicated = htable.duplicated_object(values, keep=keep)

    return duplicated


def mode(values):
    """Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
    # must sort because hash order isn't necessarily defined.
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor = values._constructor
        values = values.values
    else:
        values = np.asanyarray(values)
        constructor = Series

    dtype = values.dtype
    if is_integer_dtype(values):
        values = _ensure_int64(values)
        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        values = values.view(np.int64)
        result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
    elif is_categorical_dtype(values):
        result = constructor(values.mode())
    else:
        mask = isnull(values)
        values = _ensure_object(values)
        res = htable.mode_object(values, mask)
        try:
            res = sorted(res)
        except TypeError as e:
            warn("Unable to sort modes: %s" % e)
        result = constructor(res, dtype=dtype)

    return result


def rank(values, axis=0, method='average', na_option='keep',
         ascending=True, pct=False):
    """
    Rank the values along the given axis, resolving ties per ``method``.
    """
    if values.ndim == 1:
        f, values = _get_data_algo(values, _rank1d_functions)
        ranks = f(values, ties_method=method, ascending=ascending,
                  na_option=na_option, pct=pct)
    elif values.ndim == 2:
        f, values = _get_data_algo(values, _rank2d_functions)
        ranks = f(values, axis=axis, ties_method=method,
                  ascending=ascending, na_option=na_option, pct=pct)
    return ranks


_rank1d_functions = {
    'float64': algos.rank_1d_float64,
    'int64': algos.rank_1d_int64,
    'generic': algos.rank_1d_generic
}

_rank2d_functions = {
    'float64': algos.rank_2d_float64,
    'int64': algos.rank_2d_int64,
    'generic': algos.rank_2d_generic
}
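

# Example (illustrative sketch, not from the upstream source) for the two
# hashtable-backed helpers above:
#
#   >>> duplicated(np.array([1, 2, 1, 3]))
#   array([False, False,  True, False], dtype=bool)
#   >>> mode(np.array([1, 1, 2, 3]))
#   0    1
#   dtype: int64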


def quantile(x, q, interpolation_method='fraction'):
    """
    Compute sample quantile or quantiles of the input array. For example,
    q=0.5 computes the median.

    The `interpolation_method` parameter supports three values, namely
    `fraction` (default), `lower` and `higher`. Interpolation is done only
    if the desired quantile lies between two data points `i` and `j`. For
    `fraction`, the result is an interpolated value between `i` and `j`;
    for `lower`, the result is `i`, for `higher` the result is `j`.

    Parameters
    ----------
    x : ndarray
        Values from which to extract score.
    q : scalar or array
        Percentile at which to extract score.
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        This optional parameter specifies the interpolation method to use,
        when the desired quantile lies between two data points `i` and `j`:

        - fraction: `i + (j - i)*fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        - lower: `i`.
        - higher: `j`.

    Returns
    -------
    score : float
        Score at percentile.

    Examples
    --------
    >>> from scipy import stats
    >>> a = np.arange(100)
    >>> stats.scoreatpercentile(a, 50)
    49.5
    """
    x = np.asarray(x)
    mask = isnull(x)

    x = x[~mask]

    values = np.sort(x)

    def _get_score(at):
        if len(values) == 0:
            return np.nan

        idx = at * (len(values) - 1)
        if idx % 1 == 0:
            score = values[int(idx)]
        else:
            if interpolation_method == 'fraction':
                score = _interpolate(values[int(idx)],
                                     values[int(idx) + 1],
                                     idx % 1)
            elif interpolation_method == 'lower':
                score = values[int(np.floor(idx))]
            elif interpolation_method == 'higher':
                score = values[int(np.ceil(idx))]
            else:
                raise ValueError("interpolation_method can only be "
                                 "'fraction', 'lower' or 'higher'")

        return score

    if is_scalar(q):
        return _get_score(q)
    else:
        q = np.asarray(q, np.float64)
        return algos.arrmap_float64(q, _get_score)


def _interpolate(a, b, fraction):
    """Returns the point at the given fraction between a and b, where
    'fraction' must be between 0 and 1.
    """
    return a + (b - a) * fraction
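

# Example (illustrative sketch, not from the upstream source): with the
# default 'fraction' method the quantile is linearly interpolated between
# the two surrounding data points.
#
#   >>> quantile(np.arange(5), 0.5)
#   2
#   >>> quantile(np.array([1.0, 2.0]), 0.75)
#   1.75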


def nsmallest(arr, n, keep='first'):
    """
    Find the indices of the n smallest values of a numpy array.

    Note: Fails silently with NaN.
    """
    if keep == 'last':
        arr = arr[::-1]

    narr = len(arr)
    n = min(n, narr)

    sdtype = str(arr.dtype)
    arr = arr.view(_dtype_map.get(sdtype, sdtype))

    kth_val = algos.kth_smallest(arr.copy(), n - 1)
    return _finalize_nsmallest(arr, kth_val, n, keep, narr)


def nlargest(arr, n, keep='first'):
    """
    Find the indices of the n largest values of a numpy array.

    Note: Fails silently with NaN.
    """
    sdtype = str(arr.dtype)
    arr = arr.view(_dtype_map.get(sdtype, sdtype))
    return nsmallest(-arr, n, keep=keep)


def select_n_slow(dropped, n, keep, method):
    reverse_it = (keep == 'last' or method == 'nlargest')
    ascending = method == 'nsmallest'
    slc = np.s_[::-1] if reverse_it else np.s_[:]
    return dropped[slc].sort_values(ascending=ascending).head(n)


_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}


def select_n(series, n, keep, method):
    """Implement n largest/smallest.

    Parameters
    ----------
    n : int
    keep : {'first', 'last'}, default 'first'
    method : str, {'nlargest', 'nsmallest'}

    Returns
    -------
    nordered : Series
    """
    dtype = series.dtype
    if not issubclass(dtype.type, (np.integer, np.floating, np.datetime64,
                                   np.timedelta64)):
        raise TypeError("Cannot use method %r with dtype %s" %
                        (method, dtype))

    if keep not in ('first', 'last'):
        raise ValueError('keep must be either "first" or "last"')

    if n <= 0:
        return series[[]]

    dropped = series.dropna()

    if n >= len(series):
        return select_n_slow(dropped, n, keep, method)

    inds = _select_methods[method](dropped.values, n, keep)
    return dropped.iloc[inds]


def _finalize_nsmallest(arr, kth_val, n, keep, narr):
    ns, = np.nonzero(arr <= kth_val)
    inds = ns[arr[ns].argsort(kind='mergesort')][:n]

    if keep == 'last':
        # reverse indices
        return narr - 1 - inds
    else:
        return inds


_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'}
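

# Example (illustrative sketch, not from the upstream source):
# nsmallest/nlargest return positional indices into the input array, not the
# values themselves.
#
#   >>> nsmallest(np.array([3.0, 1.0, 2.0]), 2)
#   array([1, 2])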


# ------- #
# helpers #
# ------- #

def _hashtable_algo(f, dtype, return_dtype=None):
    """
    f(HashTable, type_caster) -> result
    """
    if is_float_dtype(dtype):
        return f(htable.Float64HashTable, _ensure_float64)
    elif is_integer_dtype(dtype):
        return f(htable.Int64HashTable, _ensure_int64)
    elif is_datetime64_dtype(dtype):
        return_dtype = return_dtype or 'M8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
    elif is_timedelta64_dtype(dtype):
        return_dtype = return_dtype or 'm8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
    else:
        return f(htable.PyObjectHashTable, _ensure_object)


_hashtables = {
    'float64': (htable.Float64HashTable, htable.Float64Vector),
    'int64': (htable.Int64HashTable, htable.Int64Vector),
    'generic': (htable.PyObjectHashTable, htable.ObjectVector)
}


def _get_data_algo(values, func_map):
    if is_float_dtype(values):
        f = func_map['float64']
        values = _ensure_float64(values)

    elif needs_i8_conversion(values):
        f = func_map['int64']
        values = values.view('i8')

    elif is_integer_dtype(values):
        f = func_map['int64']
        values = _ensure_int64(values)

    else:
        f = func_map['generic']
        values = _ensure_object(values)

    return f, values


# ---- #
# take #
# ---- #

def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
    def wrapper(arr, indexer, out, fill_value=np.nan):
        if arr_dtype is not None:
            arr = arr.view(arr_dtype)
        if out_dtype is not None:
            out = out.view(out_dtype)
        if fill_wrap is not None:
            fill_value = fill_wrap(fill_value)
        f(arr, indexer, out, fill_value=fill_value)
    return wrapper


def _convert_wrapper(f, conv_dtype):
    def wrapper(arr, indexer, out, fill_value=np.nan):
        arr = arr.astype(conv_dtype)
        f(arr, indexer, out, fill_value=fill_value)
    return wrapper


def _take_2d_multi_generic(arr, indexer, out, fill_value, mask_info):
    # this is not ideal, performance-wise, but it's better than raising
    # an exception (best to optimize in Cython to avoid getting here)
    row_idx, col_idx = indexer
    if mask_info is not None:
        (row_mask, col_mask), (row_needs, col_needs) = mask_info
    else:
        row_mask = row_idx == -1
        col_mask = col_idx == -1
        row_needs = row_mask.any()
        col_needs = col_mask.any()
    if fill_value is not None:
        if row_needs:
            out[row_mask, :] = fill_value
        if col_needs:
            out[:, col_mask] = fill_value
    for i in range(len(row_idx)):
        u_ = row_idx[i]
        for j in range(len(col_idx)):
            v = col_idx[j]
            out[i, j] = arr[u_, v]


def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info):
    if mask_info is not None:
        mask, needs_masking = mask_info
    else:
        mask = indexer == -1
        needs_masking = mask.any()
    if arr.dtype != out.dtype:
        arr = arr.astype(out.dtype)
    if arr.shape[axis] > 0:
        arr.take(_ensure_platform_int(indexer), axis=axis, out=out)
    if needs_masking:
        outindexer = [slice(None)] * arr.ndim
        outindexer[axis] = mask
        out[tuple(outindexer)] = fill_value


_take_1d_dict = {
    ('int8', 'int8'): algos.take_1d_int8_int8,
    ('int8', 'int32'): algos.take_1d_int8_int32,
    ('int8', 'int64'): algos.take_1d_int8_int64,
    ('int8', 'float64'): algos.take_1d_int8_float64,
    ('int16', 'int16'): algos.take_1d_int16_int16,
    ('int16', 'int32'): algos.take_1d_int16_int32,
    ('int16', 'int64'): algos.take_1d_int16_int64,
    ('int16', 'float64'): algos.take_1d_int16_float64,
    ('int32', 'int32'): algos.take_1d_int32_int32,
    ('int32', 'int64'): algos.take_1d_int32_int64,
    ('int32', 'float64'): algos.take_1d_int32_float64,
    ('int64', 'int64'): algos.take_1d_int64_int64,
    ('int64', 'float64'): algos.take_1d_int64_float64,
    ('float32', 'float32'): algos.take_1d_float32_float32,
    ('float32', 'float64'): algos.take_1d_float32_float64,
    ('float64', 'float64'): algos.take_1d_float64_float64,
    ('object', 'object'): algos.take_1d_object_object,
    ('bool', 'bool'): _view_wrapper(algos.take_1d_bool_bool, np.uint8,
                                    np.uint8),
    ('bool', 'object'): _view_wrapper(algos.take_1d_bool_object,
                                      np.uint8, None),
    ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper(
        algos.take_1d_int64_int64, np.int64, np.int64, np.int64)
}

_take_2d_axis0_dict = {
    ('int8', 'int8'): algos.take_2d_axis0_int8_int8,
    ('int8', 'int32'): algos.take_2d_axis0_int8_int32,
    ('int8', 'int64'): algos.take_2d_axis0_int8_int64,
    ('int8', 'float64'): algos.take_2d_axis0_int8_float64,
    ('int16', 'int16'): algos.take_2d_axis0_int16_int16,
    ('int16', 'int32'): algos.take_2d_axis0_int16_int32,
    ('int16', 'int64'): algos.take_2d_axis0_int16_int64,
    ('int16', 'float64'): algos.take_2d_axis0_int16_float64,
    ('int32', 'int32'): algos.take_2d_axis0_int32_int32,
    ('int32', 'int64'): algos.take_2d_axis0_int32_int64,
    ('int32', 'float64'): algos.take_2d_axis0_int32_float64,
    ('int64', 'int64'): algos.take_2d_axis0_int64_int64,
    ('int64', 'float64'): algos.take_2d_axis0_int64_float64,
    ('float32', 'float32'): algos.take_2d_axis0_float32_float32,
    ('float32', 'float64'): algos.take_2d_axis0_float32_float64,
    ('float64', 'float64'): algos.take_2d_axis0_float64_float64,
    ('object', 'object'): algos.take_2d_axis0_object_object,
    ('bool', 'bool'): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8,
                                    np.uint8),
    ('bool', 'object'): _view_wrapper(algos.take_2d_axis0_bool_object,
                                      np.uint8, None),
    ('datetime64[ns]', 'datetime64[ns]'):
    _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64,
                  fill_wrap=np.int64)
}

_take_2d_axis1_dict = {
    ('int8', 'int8'): algos.take_2d_axis1_int8_int8,
    ('int8', 'int32'): algos.take_2d_axis1_int8_int32,
    ('int8', 'int64'): algos.take_2d_axis1_int8_int64,
    ('int8', 'float64'): algos.take_2d_axis1_int8_float64,
    ('int16', 'int16'): algos.take_2d_axis1_int16_int16,
    ('int16', 'int32'): algos.take_2d_axis1_int16_int32,
    ('int16', 'int64'): algos.take_2d_axis1_int16_int64,
    ('int16', 'float64'): algos.take_2d_axis1_int16_float64,
    ('int32', 'int32'): algos.take_2d_axis1_int32_int32,
    ('int32', 'int64'): algos.take_2d_axis1_int32_int64,
    ('int32', 'float64'): algos.take_2d_axis1_int32_float64,
    ('int64', 'int64'): algos.take_2d_axis1_int64_int64,
    ('int64', 'float64'): algos.take_2d_axis1_int64_float64,
    ('float32', 'float32'): algos.take_2d_axis1_float32_float32,
    ('float32', 'float64'): algos.take_2d_axis1_float32_float64,
    ('float64', 'float64'): algos.take_2d_axis1_float64_float64,
    ('object', 'object'): algos.take_2d_axis1_object_object,
    ('bool', 'bool'): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8,
                                    np.uint8),
    ('bool', 'object'): _view_wrapper(algos.take_2d_axis1_bool_object,
                                      np.uint8, None),
    ('datetime64[ns]', 'datetime64[ns]'):
    _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64,
                  fill_wrap=np.int64)
}

_take_2d_multi_dict = {
    ('int8', 'int8'): algos.take_2d_multi_int8_int8,
    ('int8', 'int32'): algos.take_2d_multi_int8_int32,
    ('int8', 'int64'): algos.take_2d_multi_int8_int64,
    ('int8', 'float64'): algos.take_2d_multi_int8_float64,
    ('int16', 'int16'): algos.take_2d_multi_int16_int16,
    ('int16', 'int32'): algos.take_2d_multi_int16_int32,
    ('int16', 'int64'): algos.take_2d_multi_int16_int64,
    ('int16', 'float64'): algos.take_2d_multi_int16_float64,
    ('int32', 'int32'): algos.take_2d_multi_int32_int32,
    ('int32', 'int64'): algos.take_2d_multi_int32_int64,
    ('int32', 'float64'): algos.take_2d_multi_int32_float64,
    ('int64', 'int64'): algos.take_2d_multi_int64_int64,
    ('int64', 'float64'): algos.take_2d_multi_int64_float64,
    ('float32', 'float32'): algos.take_2d_multi_float32_float32,
    ('float32', 'float64'): algos.take_2d_multi_float32_float64,
    ('float64', 'float64'): algos.take_2d_multi_float64_float64,
    ('object', 'object'): algos.take_2d_multi_object_object,
    ('bool', 'bool'): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8,
                                    np.uint8),
    ('bool', 'object'): _view_wrapper(algos.take_2d_multi_bool_object,
                                      np.uint8, None),
    ('datetime64[ns]', 'datetime64[ns]'):
    _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64,
                  fill_wrap=np.int64)
}


def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None):
    if ndim <= 2:
        tup = (arr_dtype.name, out_dtype.name)
        if ndim == 1:
            func = _take_1d_dict.get(tup, None)
        elif ndim == 2:
            if axis == 0:
                func = _take_2d_axis0_dict.get(tup, None)
            else:
                func = _take_2d_axis1_dict.get(tup, None)
        if func is not None:
            return func

        tup = (out_dtype.name, out_dtype.name)
        if ndim == 1:
            func = _take_1d_dict.get(tup, None)
        elif ndim == 2:
            if axis == 0:
                func = _take_2d_axis0_dict.get(tup, None)
            else:
                func = _take_2d_axis1_dict.get(tup, None)
        if func is not None:
            func = _convert_wrapper(func, out_dtype)
            return func

    def func(arr, indexer, out, fill_value=np.nan):
        indexer = _ensure_int64(indexer)
        _take_nd_generic(arr, indexer, out, axis=axis,
                         fill_value=fill_value, mask_info=mask_info)
    return func


def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan,
            mask_info=None, allow_fill=True):
    """
    Specialized Cython take which sets NaN values in one pass

    Parameters
    ----------
    arr : ndarray
        Input array
    indexer : ndarray
        1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        _maybe_promote to determine this type for any fill_value
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    mask_info : tuple of (ndarray, boolean)
        If provided, value should correspond to:
            (indexer != -1, (indexer != -1).any())
        If not provided, it will be computed internally if necessary
    allow_fill : boolean, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done. This short-circuits computation of a mask. Result is
        undefined if allow_fill == False and -1 is present in indexer.
    """
""" # dispatch to internal type takes if is_categorical(arr): return arr.take_nd(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: indexer = _ensure_int64(indexer) if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False else: # check for promotion based on types only (do this first because # it's faster than computing a mask) dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: mask, needs_masking = mask_info else: mask = indexer == -1 needs_masking = mask.any() mask_info = mask, needs_masking if needs_masking: if out is not None and out.dtype != dtype: raise TypeError('Incompatible type for fill_value') else: # if not, then depromote, set fill_value to dummy # (it won't be used but we don't want the cython code # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() flip_order = False if arr.ndim == 2: if arr.flags.f_contiguous: flip_order = True if flip_order: arr = arr.T axis = arr.ndim - axis - 1 if out is not None: out = out.T # at this point, it's guaranteed that dtype can hold both the arr values # and the fill_value if out is None: out_shape = list(arr.shape) out_shape[axis] = len(indexer) out_shape = tuple(out_shape) if arr.flags.f_contiguous and axis == arr.ndim - 1: # minor tweak that can make an order-of-magnitude difference # for dataframes initialized directly from 2-d ndarrays # (s.t. df.values is c-contiguous and df._data.blocks[0] is its # f-contiguous transpose) out = np.empty(out_shape, dtype=dtype, order='F') else: out = np.empty(out_shape, dtype=dtype) func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info) indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) if flip_order: out = out.T return out take_1d = take_nd def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True): """ Specialized Cython take which sets NaN values in one pass """ if indexer is None or (indexer[0] is None and indexer[1] is None): row_idx = np.arange(arr.shape[0], dtype=np.int64) col_idx = np.arange(arr.shape[1], dtype=np.int64) indexer = row_idx, col_idx dtype, fill_value = arr.dtype, arr.dtype.type() else: row_idx, col_idx = indexer if row_idx is None: row_idx = np.arange(arr.shape[0], dtype=np.int64) else: row_idx = _ensure_int64(row_idx) if col_idx is None: col_idx = np.arange(arr.shape[1], dtype=np.int64) else: col_idx = _ensure_int64(col_idx) indexer = row_idx, col_idx if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False else: # check for promotion based on types only (do this first because # it's faster than computing a mask) dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: (row_mask, col_mask), (row_needs, col_needs) = mask_info else: row_mask = row_idx == -1 col_mask = col_idx == -1 row_needs = row_mask.any() col_needs = col_mask.any() mask_info = (row_mask, col_mask), (row_needs, col_needs) if row_needs or col_needs: if out is not None and out.dtype != dtype: raise 


def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None,
                  allow_fill=True):
    """
    Specialized Cython take which sets NaN values in one pass
    """
    if indexer is None or (indexer[0] is None and indexer[1] is None):
        row_idx = np.arange(arr.shape[0], dtype=np.int64)
        col_idx = np.arange(arr.shape[1], dtype=np.int64)
        indexer = row_idx, col_idx
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        row_idx, col_idx = indexer
        if row_idx is None:
            row_idx = np.arange(arr.shape[0], dtype=np.int64)
        else:
            row_idx = _ensure_int64(row_idx)
        if col_idx is None:
            col_idx = np.arange(arr.shape[1], dtype=np.int64)
        else:
            col_idx = _ensure_int64(col_idx)
        indexer = row_idx, col_idx
        if not allow_fill:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = _maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                if mask_info is not None:
                    (row_mask, col_mask), (row_needs, col_needs) = mask_info
                else:
                    row_mask = row_idx == -1
                    col_mask = col_idx == -1
                    row_needs = row_mask.any()
                    col_needs = col_mask.any()
                    mask_info = (row_mask, col_mask), (row_needs, col_needs)
                if row_needs or col_needs:
                    if out is not None and out.dtype != dtype:
                        raise TypeError('Incompatible type for fill_value')
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        out_shape = len(row_idx), len(col_idx)
        out = np.empty(out_shape, dtype=dtype)

    func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
    if func is None and arr.dtype != out.dtype:
        func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
        if func is not None:
            func = _convert_wrapper(func, out.dtype)
    if func is None:

        def func(arr, indexer, out, fill_value=np.nan):
            _take_2d_multi_generic(arr, indexer, out,
                                   fill_value=fill_value, mask_info=mask_info)

    func(arr, indexer, out=out, fill_value=fill_value)
    return out


# ---- #
# diff #
# ---- #

_diff_special = {
    'float64': algos.diff_2d_float64,
    'float32': algos.diff_2d_float32,
    'int64': algos.diff_2d_int64,
    'int32': algos.diff_2d_int32,
    'int16': algos.diff_2d_int16,
    'int8': algos.diff_2d_int8,
}


def diff(arr, n, axis=0):
    """ difference of n between self,
    analogous to s - s.shift(n)
    """

    n = int(n)
    na = np.nan
    dtype = arr.dtype
    is_timedelta = False
    if needs_i8_conversion(arr):
        dtype = np.float64
        arr = arr.view('i8')
        na = tslib.iNaT
        is_timedelta = True
    elif issubclass(dtype.type, np.integer):
        dtype = np.float64
    elif issubclass(dtype.type, np.bool_):
        dtype = np.object_

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    na_indexer = [slice(None)] * arr.ndim
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.ndim == 2 and arr.dtype.name in _diff_special:
        f = _diff_special[arr.dtype.name]
        f(arr, out_arr, n, axis)
    else:
        res_indexer = [slice(None)] * arr.ndim
        res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(res_indexer)

        lag_indexer = [slice(None)] * arr.ndim
        lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(lag_indexer)

        # need to make sure that we account for na for datelike/timedelta
        # we don't actually want to subtract these i8 numbers
        if is_timedelta:
            res = arr[res_indexer]
            lag = arr[lag_indexer]

            mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na)
            if mask.any():
                res = res.copy()
                res[mask] = 0
                lag = lag.copy()
                lag[mask] = 0

            result = res - lag
            result[mask] = na
            out_arr[res_indexer] = result
        else:
            out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]

    if is_timedelta:
        from pandas import TimedeltaIndex
        out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape(
            out_arr.shape).astype('timedelta64[ns]')

    return out_arr
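

# Example (illustrative sketch, not from the upstream source; exact repr
# varies by numpy version): the first ``n`` positions along the axis are NaN,
# and integer input is promoted to float64.
#
#   >>> diff(np.array([1, 2, 4, 7]), 1)
#   array([ nan,   1.,   2.,   3.])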