# pylint: disable=E1101,E1103
# pylint: disable=W0703,W0622,W0613,W0201
from pandas.compat import range, zip
from pandas import compat
import itertools
import numpy as np
from pandas.types.common import _ensure_platform_int, is_list_like
from pandas.types.cast import _maybe_promote
from pandas.types.missing import notnull
import pandas.types.concat as _concat
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.sparse import SparseDataFrame, SparseSeries
from pandas.sparse.array import SparseArray
from pandas._sparse import IntIndex
from pandas.core.categorical import Categorical
from pandas.core.groupby import get_group_index, _compress_group_index
import pandas.core.algorithms as algos
import pandas.algos as _algos
from pandas.core.index import MultiIndex, _get_na_value
class _Unstacker(object):
"""
Helper class to unstack data / pivot with multi-level index
Parameters
----------
level : int or str, default last level
Level to "unstack". Accepts a name for the level.
Examples
--------
>>> import pandas as pd
>>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
... ('two', 'a'), ('two', 'b')])
>>> s = pd.Series(np.arange(1.0, 5.0), index=index)
>>> s
one a 1
b 2
two a 3
b 4
dtype: float64
>>> s.unstack(level=-1)
a b
one 1 2
two 3 4
>>> s.unstack(level=0)
one two
a 1 2
b 3 4
Returns
-------
unstacked : DataFrame
"""
def __init__(self, values, index, level=-1, value_columns=None,
fill_value=None):
self.is_categorical = None
if values.ndim == 1:
if isinstance(values, Categorical):
self.is_categorical = values
values = np.array(values)
values = values[:, np.newaxis]
self.values = values
self.value_columns = value_columns
self.fill_value = fill_value
if value_columns is None and values.shape[1] != 1: # pragma: no cover
raise ValueError('must pass column labels for multi-column data')
self.index = index
if isinstance(self.index, MultiIndex):
if index._reference_duplicate_name(level):
msg = ("Ambiguous reference to {0}. The index "
"names are not unique.".format(level))
raise ValueError(msg)
self.level = self.index._get_level_number(level)
# when index includes `nan`, need to lift levels/strides by 1
self.lift = 1 if -1 in self.index.labels[self.level] else 0
self.new_index_levels = list(index.levels)
self.new_index_names = list(index.names)
self.removed_name = self.new_index_names.pop(self.level)
self.removed_level = self.new_index_levels.pop(self.level)
self._make_sorted_values_labels()
self._make_selectors()
def _make_sorted_values_labels(self):
v = self.level
labs = list(self.index.labels)
levs = list(self.index.levels)
to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]
comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
ngroups = len(obs_ids)
indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
indexer = _ensure_platform_int(indexer)
self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
self.sorted_labels = [l.take(indexer) for l in to_sort]
def _make_selectors(self):
new_levels = self.new_index_levels
# make the mask
remaining_labels = self.sorted_labels[:-1]
level_sizes = [len(x) for x in new_levels]
comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
ngroups = len(obs_ids)
comp_index = _ensure_platform_int(comp_index)
stride = self.index.levshape[self.level] + self.lift
self.full_shape = ngroups, stride
selector = self.sorted_labels[-1] + stride * comp_index + self.lift
mask = np.zeros(np.prod(self.full_shape), dtype=bool)
mask.put(selector, True)
if mask.sum() < len(self.index):
raise ValueError('Index contains duplicate entries, '
'cannot reshape')
self.group_index = comp_index
self.mask = mask
self.unique_groups = obs_ids
self.compressor = comp_index.searchsorted(np.arange(ngroups))
def get_result(self):
# TODO: find a better way than this masking business
values, value_mask = self.get_new_values()
columns = self.get_new_columns()
index = self.get_new_index()
# filter out missing levels
if values.shape[1] > 0:
col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
# rare case, level values not observed
if len(obs_ids) < self.full_shape[1]:
inds = (value_mask.sum(0) > 0).nonzero()[0]
values = algos.take_nd(values, inds, axis=1)
columns = columns[inds]
# may need to coerce categoricals here
if self.is_categorical is not None:
categories = self.is_categorical.categories
ordered = self.is_categorical.ordered
values = [Categorical.from_array(values[:, i],
categories=categories,
ordered=ordered)
for i in range(values.shape[-1])]
return DataFrame(values, index=index, columns=columns)
def get_new_values(self):
values = self.values
# place the values
length, width = self.full_shape
stride = values.shape[1]
result_width = width * stride
result_shape = (length, result_width)
# if our mask is all True, then we can use our existing dtype
if self.mask.all():
dtype = values.dtype
new_values = np.empty(result_shape, dtype=dtype)
else:
dtype, fill_value = _maybe_promote(values.dtype, self.fill_value)
new_values = np.empty(result_shape, dtype=dtype)
new_values.fill(fill_value)
new_mask = np.zeros(result_shape, dtype=bool)
# is there a simpler / faster way of doing this?
for i in range(values.shape[1]):
chunk = new_values[:, i * width:(i + 1) * width]
mask_chunk = new_mask[:, i * width:(i + 1) * width]
chunk.flat[self.mask] = self.sorted_values[:, i]
mask_chunk.flat[self.mask] = True
return new_values, new_mask
def get_new_columns(self):
if self.value_columns is None:
if self.lift == 0:
return self.removed_level
lev = self.removed_level
return lev.insert(0, _get_na_value(lev.dtype.type))
stride = len(self.removed_level) + self.lift
width = len(self.value_columns)
propagator = np.repeat(np.arange(width), stride)
if isinstance(self.value_columns, MultiIndex):
new_levels = self.value_columns.levels + (self.removed_level,)
new_names = self.value_columns.names + (self.removed_name,)
new_labels = [lab.take(propagator)
for lab in self.value_columns.labels]
else:
new_levels = [self.value_columns, self.removed_level]
new_names = [self.value_columns.name, self.removed_name]
new_labels = [propagator]
new_labels.append(np.tile(np.arange(stride) - self.lift, width))
return MultiIndex(levels=new_levels, labels=new_labels,
names=new_names, verify_integrity=False)
def get_new_index(self):
result_labels = [lab.take(self.compressor)
for lab in self.sorted_labels[:-1]]
# construct the new index
if len(self.new_index_levels) == 1:
lev, lab = self.new_index_levels[0], result_labels[0]
if (lab == -1).any():
lev = lev.insert(len(lev), _get_na_value(lev.dtype.type))
return lev.take(lab)
return MultiIndex(levels=self.new_index_levels, labels=result_labels,
names=self.new_index_names, verify_integrity=False)
def _unstack_multiple(data, clocs):
from pandas.core.groupby import decons_obs_group_ids
if len(clocs) == 0:
return data
# NOTE: This doesn't deal with hierarchical columns yet
index = data.index
clocs = [index._get_level_number(i) for i in clocs]
rlocs = [i for i in range(index.nlevels) if i not in clocs]
clevels = [index.levels[i] for i in clocs]
clabels = [index.labels[i] for i in clocs]
cnames = [index.names[i] for i in clocs]
rlevels = [index.levels[i] for i in rlocs]
rlabels = [index.labels[i] for i in rlocs]
rnames = [index.names[i] for i in rlocs]
shape = [len(x) for x in clevels]
group_index = get_group_index(clabels, shape, sort=False, xnull=False)
comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels,
xnull=False)
dummy_index = MultiIndex(levels=rlevels + [obs_ids],
labels=rlabels + [comp_ids],
names=rnames + ['__placeholder__'],
verify_integrity=False)
if isinstance(data, Series):
dummy = Series(data.values, index=dummy_index)
unstacked = dummy.unstack('__placeholder__')
new_levels = clevels
new_names = cnames
new_labels = recons_labels
else:
if isinstance(data.columns, MultiIndex):
result = data
for i in range(len(clocs)):
val = clocs[i]
result = result.unstack(val)
clocs = [v if i > v else v - 1 for v in clocs]
return result
dummy = DataFrame(data.values, index=dummy_index, columns=data.columns)
unstacked = dummy.unstack('__placeholder__')
if isinstance(unstacked, Series):
unstcols = unstacked.index
else:
unstcols = unstacked.columns
new_levels = [unstcols.levels[0]] + clevels
new_names = [data.columns.name] + cnames
new_labels = [unstcols.labels[0]]
for rec in recons_labels:
new_labels.append(rec.take(unstcols.labels[-1]))
new_columns = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names, verify_integrity=False)
if isinstance(unstacked, Series):
unstacked.index = new_columns
else:
unstacked.columns = new_columns
return unstacked
[docs]def pivot(self, index=None, columns=None, values=None):
"""
See DataFrame.pivot
"""
if values is None:
cols = [columns] if index is None else [index, columns]
append = index is None
indexed = self.set_index(cols, append=append)
return indexed.unstack(columns)
else:
if index is None:
index = self.index
else:
index = self[index]
indexed = Series(self[values].values,
index=MultiIndex.from_arrays([index, self[columns]]))
return indexed.unstack(columns)
def pivot_simple(index, columns, values):
"""
Produce 'pivot' table based on 3 columns of this DataFrame.
Uses unique values from index / columns and fills with values.
Parameters
----------
index : ndarray
Labels to use to make new frame's index
columns : ndarray
Labels to use to make new frame's columns
values : ndarray
Values to use for populating new frame's values
Notes
-----
Obviously, all 3 of the input arguments must have the same length
Returns
-------
DataFrame
"""
if (len(index) != len(columns)) or (len(columns) != len(values)):
raise AssertionError('Length of index, columns, and values must be the'
' same')
if len(index) == 0:
return DataFrame(index=[])
hindex = MultiIndex.from_arrays([index, columns])
series = Series(values.ravel(), index=hindex)
series = series.sortlevel(0)
return series.unstack()
def _slow_pivot(index, columns, values):
"""
Produce 'pivot' table based on 3 columns of this DataFrame.
Uses unique values from index / columns and fills with values.
Parameters
----------
index : string or object
Column name to use to make new frame's index
columns : string or object
Column name to use to make new frame's columns
values : string or object
Column name to use for populating new frame's values
Could benefit from some Cython here.
"""
tree = {}
for i, (idx, col) in enumerate(zip(index, columns)):
if col not in tree:
tree[col] = {}
branch = tree[col]
branch[idx] = values[i]
return DataFrame(tree)
def unstack(obj, level, fill_value=None):
if isinstance(level, (tuple, list)):
return _unstack_multiple(obj, level)
if isinstance(obj, DataFrame):
if isinstance(obj.index, MultiIndex):
return _unstack_frame(obj, level, fill_value=fill_value)
else:
return obj.T.stack(dropna=False)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
fill_value=fill_value)
return unstacker.get_result()
def _unstack_frame(obj, level, fill_value=None):
from pandas.core.internals import BlockManager, make_block
if obj._is_mixed_type:
unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy
obj.index, level=level,
value_columns=obj.columns)
new_columns = unstacker.get_new_columns()
new_index = unstacker.get_new_index()
new_axes = [new_columns, new_index]
new_blocks = []
mask_blocks = []
for blk in obj._data.blocks:
blk_items = obj._data.items[blk.mgr_locs.indexer]
bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
value_columns=blk_items,
fill_value=fill_value)
new_items = bunstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = bunstacker.get_new_values()
mblk = make_block(mask.T, placement=new_placement)
mask_blocks.append(mblk)
newb = make_block(new_values.T, placement=new_placement)
new_blocks.append(newb)
result = DataFrame(BlockManager(new_blocks, new_axes))
mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
return result.ix[:, mask_frame.sum(0) > 0]
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
value_columns=obj.columns,
fill_value=fill_value)
return unstacker.get_result()
def get_compressed_ids(labels, sizes):
from pandas.core.groupby import get_group_index
ids = get_group_index(labels, sizes, sort=True, xnull=False)
return _compress_group_index(ids, sort=True)
def stack(frame, level=-1, dropna=True):
"""
Convert DataFrame to Series with multi-level Index. Columns become the
second level of the resulting hierarchical index
Returns
-------
stacked : Series
"""
def factorize(index):
if index.is_unique:
return index, np.arange(len(index))
cat = Categorical(index, ordered=True)
return cat.categories, cat.codes
N, K = frame.shape
if isinstance(frame.columns, MultiIndex):
if frame.columns._reference_duplicate_name(level):
msg = ("Ambiguous reference to {0}. The column "
"names are not unique.".format(level))
raise ValueError(msg)
# Will also convert negative level numbers and check if out of bounds.
level_num = frame.columns._get_level_number(level)
if isinstance(frame.columns, MultiIndex):
return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
elif isinstance(frame.index, MultiIndex):
new_levels = list(frame.index.levels)
new_labels = [lab.repeat(K) for lab in frame.index.labels]
clev, clab = factorize(frame.columns)
new_levels.append(clev)
new_labels.append(np.tile(clab, N).ravel())
new_names = list(frame.index.names)
new_names.append(frame.columns.name)
new_index = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names, verify_integrity=False)
else:
levels, (ilab, clab) = zip(*map(factorize, (frame.index,
frame.columns)))
labels = ilab.repeat(K), np.tile(clab, N).ravel()
new_index = MultiIndex(levels=levels, labels=labels,
names=[frame.index.name, frame.columns.name],
verify_integrity=False)
new_values = frame.values.ravel()
if dropna:
mask = notnull(new_values)
new_values = new_values[mask]
new_index = new_index[mask]
return Series(new_values, index=new_index)
def stack_multiple(frame, level, dropna=True):
# If all passed levels match up to column names, no
# ambiguity about what to do
if all(lev in frame.columns.names for lev in level):
result = frame
for lev in level:
result = stack(result, lev, dropna=dropna)
# Otherwise, level numbers may change as each successive level is stacked
elif all(isinstance(lev, int) for lev in level):
# As each stack is done, the level numbers decrease, so we need
# to account for that when level is a sequence of ints
result = frame
# _get_level_number() checks level numbers are in range and converts
# negative numbers to positive
level = [frame.columns._get_level_number(lev) for lev in level]
# Can't iterate directly through level as we might need to change
# values as we go
for index in range(len(level)):
lev = level[index]
result = stack(result, lev, dropna=dropna)
# Decrement all level numbers greater than current, as these
# have now shifted down by one
updated_level = []
for other in level:
if other > lev:
updated_level.append(other - 1)
else:
updated_level.append(other)
level = updated_level
else:
raise ValueError("level should contain all level names or all level "
"numbers, not a mixture of the two.")
return result
def _stack_multi_columns(frame, level_num=-1, dropna=True):
def _convert_level_number(level_num, columns):
"""
Logic for converting the level number to something we can safely pass
to swaplevel:
We generally want to convert the level number into a level name, except
when columns do not have names, in which case we must leave as a level
number
"""
if level_num in columns.names:
return columns.names[level_num]
else:
if columns.names[level_num] is None:
return level_num
else:
return columns.names[level_num]
this = frame.copy()
# this makes life much simpler
if level_num != frame.columns.nlevels - 1:
# roll levels to put selected level at end
roll_columns = this.columns
for i in range(level_num, frame.columns.nlevels - 1):
# Need to check if the ints conflict with level names
lev1 = _convert_level_number(i, roll_columns)
lev2 = _convert_level_number(i + 1, roll_columns)
roll_columns = roll_columns.swaplevel(lev1, lev2)
this.columns = roll_columns
if not this.columns.is_lexsorted():
# Workaround the edge case where 0 is one of the column names,
# which interferes with trying to sort based on the first
# level
level_to_sort = _convert_level_number(0, this.columns)
this = this.sortlevel(level_to_sort, axis=1)
# tuple list excluding level for grouping columns
if len(frame.columns.levels) > 2:
tuples = list(zip(*[lev.take(lab)
for lev, lab in zip(this.columns.levels[:-1],
this.columns.labels[:-1])]))
unique_groups = [key for key, _ in itertools.groupby(tuples)]
new_names = this.columns.names[:-1]
new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
else:
new_columns = unique_groups = this.columns.levels[0]
# time to ravel the values
new_data = {}
level_vals = this.columns.levels[-1]
level_labels = sorted(set(this.columns.labels[-1]))
level_vals_used = level_vals[level_labels]
levsize = len(level_labels)
drop_cols = []
for key in unique_groups:
loc = this.columns.get_loc(key)
slice_len = loc.stop - loc.start
# can make more efficient?
if slice_len == 0:
drop_cols.append(key)
continue
elif slice_len != levsize:
chunk = this.ix[:, this.columns[loc]]
chunk.columns = level_vals.take(chunk.columns.labels[-1])
value_slice = chunk.reindex(columns=level_vals_used).values
else:
if frame._is_mixed_type:
value_slice = this.ix[:, this.columns[loc]].values
else:
value_slice = this.values[:, loc]
new_data[key] = value_slice.ravel()
if len(drop_cols) > 0:
new_columns = new_columns.difference(drop_cols)
N = len(this)
if isinstance(this.index, MultiIndex):
new_levels = list(this.index.levels)
new_names = list(this.index.names)
new_labels = [lab.repeat(levsize) for lab in this.index.labels]
else:
new_levels = [this.index]
new_labels = [np.arange(N).repeat(levsize)]
new_names = [this.index.name] # something better?
new_levels.append(frame.columns.levels[level_num])
new_labels.append(np.tile(level_labels, N))
new_names.append(frame.columns.names[level_num])
new_index = MultiIndex(levels=new_levels, labels=new_labels,
names=new_names, verify_integrity=False)
result = DataFrame(new_data, index=new_index, columns=new_columns)
# more efficient way to go about this? can do the whole masking biz but
# will only save a small amount of time...
if dropna:
result = result.dropna(axis=0, how='all')
return result
[docs]def melt(frame, id_vars=None, value_vars=None, var_name=None,
value_name='value', col_level=None):
"""
"Unpivots" a DataFrame from wide format to long format, optionally leaving
identifier variables set.
This function is useful to massage a DataFrame into a format where one
or more columns are identifier variables (`id_vars`), while all other
columns, considered measured variables (`value_vars`), are "unpivoted" to
the row axis, leaving just two non-identifier columns, 'variable' and
'value'.
Parameters
----------
frame : DataFrame
id_vars : tuple, list, or ndarray, optional
Column(s) to use as identifier variables.
value_vars : tuple, list, or ndarray, optional
Column(s) to unpivot. If not specified, uses all columns that
are not set as `id_vars`.
var_name : scalar
Name to use for the 'variable' column. If None it uses
``frame.columns.name`` or 'variable'.
value_name : scalar, default 'value'
Name to use for the 'value' column.
col_level : int or string, optional
If columns are a MultiIndex then use this level to melt.
See also
--------
pivot_table
DataFrame.pivot
Examples
--------
>>> import pandas as pd
>>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
... 'B': {0: 1, 1: 3, 2: 5},
... 'C': {0: 2, 1: 4, 2: 6}})
>>> df
A B C
0 a 1 2
1 b 3 4
2 c 5 6
>>> pd.melt(df, id_vars=['A'], value_vars=['B'])
A variable value
0 a B 1
1 b B 3
2 c B 5
>>> pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])
A variable value
0 a B 1
1 b B 3
2 c B 5
3 a C 2
4 b C 4
5 c C 6
The names of 'variable' and 'value' columns can be customized:
>>> pd.melt(df, id_vars=['A'], value_vars=['B'],
... var_name='myVarname', value_name='myValname')
A myVarname myValname
0 a B 1
1 b B 3
2 c B 5
If you have multi-index columns:
>>> df.columns = [list('ABC'), list('DEF')]
>>> df
A B C
D E F
0 a 1 2
1 b 3 4
2 c 5 6
>>> pd.melt(df, col_level=0, id_vars=['A'], value_vars=['B'])
A variable value
0 a B 1
1 b B 3
2 c B 5
>>> pd.melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')])
(A, D) variable_0 variable_1 value
0 a B E 1
1 b B E 3
2 c B E 5
"""
# TODO: what about the existing index?
if id_vars is not None:
if not isinstance(id_vars, (tuple, list, np.ndarray)):
id_vars = [id_vars]
else:
id_vars = list(id_vars)
else:
id_vars = []
if value_vars is not None:
if not isinstance(value_vars, (tuple, list, np.ndarray)):
value_vars = [value_vars]
frame = frame.ix[:, id_vars + value_vars]
else:
frame = frame.copy()
if col_level is not None: # allow list or other?
# frame is a copy
frame.columns = frame.columns.get_level_values(col_level)
if var_name is None:
if isinstance(frame.columns, MultiIndex):
if len(frame.columns.names) == len(set(frame.columns.names)):
var_name = frame.columns.names
else:
var_name = ['variable_%s' % i
for i in range(len(frame.columns.names))]
else:
var_name = [frame.columns.name if frame.columns.name is not None
else 'variable']
if isinstance(var_name, compat.string_types):
var_name = [var_name]
N, K = frame.shape
K -= len(id_vars)
mdata = {}
for col in id_vars:
mdata[col] = np.tile(frame.pop(col).values, K)
mcolumns = id_vars + var_name + [value_name]
mdata[value_name] = frame.values.ravel('F')
for i, col in enumerate(var_name):
# asanyarray will keep the columns as an Index
mdata[col] = np.asanyarray(frame.columns.get_level_values(i)).repeat(N)
return DataFrame(mdata, columns=mcolumns)
def lreshape(data, groups, dropna=True, label=None):
"""
Reshape long-format data to wide. Generalized inverse of DataFrame.pivot
Parameters
----------
data : DataFrame
groups : dict
{new_name : list_of_columns}
dropna : boolean, default True
Examples
--------
>>> import pandas as pd
>>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
... 'team': ['Red Sox', 'Yankees'],
... 'year1': [2007, 2008], 'year2': [2008, 2008]})
>>> data
hr1 hr2 team year1 year2
0 514 545 Red Sox 2007 2008
1 573 526 Yankees 2007 2008
>>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
team hr year
0 Red Sox 514 2007
1 Yankees 573 2007
2 Red Sox 545 2008
3 Yankees 526 2008
Returns
-------
reshaped : DataFrame
"""
if isinstance(groups, dict):
keys = list(groups.keys())
values = list(groups.values())
else:
keys, values = zip(*groups)
all_cols = list(set.union(*[set(x) for x in values]))
id_cols = list(data.columns.difference(all_cols))
K = len(values[0])
for seq in values:
if len(seq) != K:
raise ValueError('All column lists must be same length')
mdata = {}
pivot_cols = []
for target, names in zip(keys, values):
to_concat = [data[col].values for col in names]
mdata[target] = _concat._concat_compat(to_concat)
pivot_cols.append(target)
for col in id_cols:
mdata[col] = np.tile(data[col].values, K)
if dropna:
mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
for c in pivot_cols:
mask &= notnull(mdata[c])
if not mask.all():
mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata))
return DataFrame(mdata, columns=id_cols + pivot_cols)
def wide_to_long(df, stubnames, i, j):
"""
Wide panel to long format. Less flexible but more user-friendly than melt.
Parameters
----------
df : DataFrame
The wide-format DataFrame
stubnames : list
A list of stub names. The wide format variables are assumed to
start with the stub names.
i : str
The name of the id variable.
j : str
The name of the subobservation variable.
stubend : str
Regex to match for the end of the stubs.
Returns
-------
DataFrame
A DataFrame that contains each stub name as a variable as well as
variables for i and j.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> np.random.seed(123)
>>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
... "A1980" : {0 : "d", 1 : "e", 2 : "f"},
... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
... "X" : dict(zip(range(3), np.random.randn(3)))
... })
>>> df["id"] = df.index
>>> df
A1970 A1980 B1970 B1980 X id
0 a d 2.5 3.2 -1.085631 0
1 b e 1.2 1.3 0.997345 1
2 c f 0.7 0.1 0.282978 2
>>> wide_to_long(df, ["A", "B"], i="id", j="year")
X A B
id year
0 1970 -1.085631 a 2.5
1 1970 0.997345 b 1.2
2 1970 0.282978 c 0.7
0 1980 -1.085631 d 3.2
1 1980 0.997345 e 1.3
2 1980 0.282978 f 0.1
Notes
-----
All extra variables are treated as extra id variables. This simply uses
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
in a typicaly case.
"""
def get_var_names(df, regex):
return df.filter(regex=regex).columns.tolist()
def melt_stub(df, stub, i, j):
varnames = get_var_names(df, "^" + stub)
newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub,
var_name=j)
newdf_j = newdf[j].str.replace(stub, "")
try:
newdf_j = newdf_j.astype(int)
except ValueError:
pass
newdf[j] = newdf_j
return newdf
id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
if i not in id_vars:
id_vars += [i]
newdf = melt_stub(df, stubnames[0], id_vars, j)
for stub in stubnames[1:]:
new = melt_stub(df, stub, id_vars, j)
newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
return newdf.set_index([i, j])
[docs]def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
columns=None, sparse=False, drop_first=False):
"""
Convert categorical variable into dummy/indicator variables
Parameters
----------
data : array-like, Series, or DataFrame
prefix : string, list of strings, or dict of strings, default None
String to append DataFrame column names
Pass a list with length equal to the number of columns
when calling get_dummies on a DataFrame. Alternativly, `prefix`
can be a dictionary mapping column names to prefixes.
prefix_sep : string, default '_'
If appending prefix, separator/delimiter to use. Or pass a
list or dictionary as with `prefix.`
dummy_na : bool, default False
Add a column to indicate NaNs, if False NaNs are ignored.
columns : list-like, default None
Column names in the DataFrame to be encoded.
If `columns` is None then all the columns with
`object` or `category` dtype will be converted.
sparse : bool, default False
Whether the dummy columns should be sparse or not. Returns
SparseDataFrame if `data` is a Series or if all columns are included.
Otherwise returns a DataFrame with some SparseBlocks.
.. versionadded:: 0.16.1
drop_first : bool, default False
Whether to get k-1 dummies out of n categorical levels by removing the
first level.
.. versionadded:: 0.18.0
Returns
-------
dummies : DataFrame or SparseDataFrame
Examples
--------
>>> import pandas as pd
>>> s = pd.Series(list('abca'))
>>> pd.get_dummies(s)
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
>>> s1 = ['a', 'b', np.nan]
>>> pd.get_dummies(s1)
a b
0 1 0
1 0 1
2 0 0
>>> pd.get_dummies(s1, dummy_na=True)
a b NaN
0 1 0 0
1 0 1 0
2 0 0 1
>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
'C': [1, 2, 3]})
>>> pd.get_dummies(df, prefix=['col1', 'col2'])
C col1_a col1_b col2_a col2_b col2_c
0 1 1 0 0 1 0
1 2 0 1 1 0 0
2 3 1 0 0 0 1
>>> pd.get_dummies(pd.Series(list('abcaa')))
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
4 1 0 0
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True))
b c
0 0 0
1 1 0
2 0 1
3 0 0
4 0 0
See Also
--------
Series.str.get_dummies
"""
from pandas.tools.merge import concat
from itertools import cycle
if isinstance(data, DataFrame):
# determine columns being encoded
if columns is None:
columns_to_encode = data.select_dtypes(
include=['object', 'category']).columns
else:
columns_to_encode = columns
# validate prefixes and separator to avoid silently dropping cols
def check_len(item, name):
length_msg = ("Length of '{0}' ({1}) did not match the length of "
"the columns being encoded ({2}).")
if is_list_like(item):
if not len(item) == len(columns_to_encode):
raise ValueError(length_msg.format(name, len(item),
len(columns_to_encode)))
check_len(prefix, 'prefix')
check_len(prefix_sep, 'prefix_sep')
if isinstance(prefix, compat.string_types):
prefix = cycle([prefix])
if isinstance(prefix, dict):
prefix = [prefix[col] for col in columns_to_encode]
if prefix is None:
prefix = columns_to_encode
# validate separators
if isinstance(prefix_sep, compat.string_types):
prefix_sep = cycle([prefix_sep])
elif isinstance(prefix_sep, dict):
prefix_sep = [prefix_sep[col] for col in columns_to_encode]
if set(columns_to_encode) == set(data.columns):
with_dummies = []
else:
with_dummies = [data.drop(columns_to_encode, axis=1)]
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
dummy_na=dummy_na, sparse=sparse,
drop_first=drop_first)
with_dummies.append(dummy)
result = concat(with_dummies, axis=1)
else:
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
sparse=sparse, drop_first=drop_first)
return result
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
sparse=False, drop_first=False):
# Series avoids inconsistent NaN handling
cat = Categorical.from_array(Series(data), ordered=True)
levels = cat.categories
def get_empty_Frame(data, sparse):
if isinstance(data, Series):
index = data.index
else:
index = np.arange(len(data))
if not sparse:
return DataFrame(index=index)
else:
return SparseDataFrame(index=index)
# if all NaN
if not dummy_na and len(levels) == 0:
return get_empty_Frame(data, sparse)
codes = cat.codes.copy()
if dummy_na:
codes[codes == -1] = len(cat.categories)
levels = np.append(cat.categories, np.nan)
# if dummy_na, we just fake a nan level. drop_first will drop it again
if drop_first and len(levels) == 1:
return get_empty_Frame(data, sparse)
number_of_cols = len(levels)
if prefix is not None:
dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
else:
dummy_cols = levels
if isinstance(data, Series):
index = data.index
else:
index = None
if sparse:
sparse_series = {}
N = len(data)
sp_indices = [[] for _ in range(len(dummy_cols))]
for ndx, code in enumerate(codes):
if code == -1:
# Blank entries if not dummy_na and code == -1, #GH4446
continue
sp_indices[code].append(ndx)
if drop_first:
# remove first categorical level to avoid perfect collinearity
# GH12042
sp_indices = sp_indices[1:]
dummy_cols = dummy_cols[1:]
for col, ixs in zip(dummy_cols, sp_indices):
sarr = SparseArray(np.ones(len(ixs)),
sparse_index=IntIndex(N, ixs), fill_value=0)
sparse_series[col] = SparseSeries(data=sarr, index=index)
return SparseDataFrame(sparse_series, index=index, columns=dummy_cols)
else:
dummy_mat = np.eye(number_of_cols).take(codes, axis=0)
if not dummy_na:
# reset NaN GH4446
dummy_mat[codes == -1] = 0
if drop_first:
# remove first GH12042
dummy_mat = dummy_mat[:, 1:]
dummy_cols = dummy_cols[1:]
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def make_axis_dummies(frame, axis='minor', transform=None):
"""
Construct 1-0 dummy variables corresponding to designated axis
labels
Parameters
----------
frame : DataFrame
axis : {'major', 'minor'}, default 'minor'
transform : function, default None
Function to apply to axis labels first. For example, to
get "day of week" dummies in a time series regression
you might call::
make_axis_dummies(panel, axis='major',
transform=lambda d: d.weekday())
Returns
-------
dummies : DataFrame
Column names taken from chosen axis
"""
numbers = {'major': 0, 'minor': 1}
num = numbers.get(axis, axis)
items = frame.index.levels[num]
labels = frame.index.labels[num]
if transform is not None:
mapped_items = items.map(transform)
cat = Categorical.from_array(mapped_items.take(labels), ordered=True)
labels = cat.codes
items = cat.categories
values = np.eye(len(items), dtype=float)
values = values.take(labels, axis=0)
return DataFrame(values, columns=items, index=frame.index)