Skip to content

Commit

Permalink
ENH: can pass external arrays in lieu of column names to DataFrame.se…
Browse files Browse the repository at this point in the history
…t_index, close #402
  • Loading branch information
wesm committed May 7, 2012
1 parent 4c31c83 commit 553bacc
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 73 deletions.
1 change: 1 addition & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ pandas 0.8.0
e.g. Series.describe and DataFrame.describe (GH #1092)
- Can create MultiIndex by passing list of lists or list of arrays to Series,
DataFrame constructor, etc. (#831)
- Can pass arrays in addition to column names to DataFrame.set_index (#402)

**API Changes**

Expand Down
32 changes: 20 additions & 12 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2069,31 +2069,36 @@ def reindex_like(self, other, method=None, copy=True, limit=None):

truncate = generic.truncate

def set_index(self, col_or_cols, drop=True, inplace=False,
verify_integrity=True):
def set_index(self, keys, drop=True, inplace=False,
verify_integrity=False):
"""
Set the DataFrame index (row labels) using one or more existing
columns. By default yields a new object.
Parameters
----------
col_or_cols : column label or list of column labels
keys : column label or list of column labels / arrays
drop : boolean, default True
Delete columns to be used as the new index
inplace : boolean, default False
Modify the DataFrame in place (do not create a new object)
verify_integrity : boolean, default True
verify_integrity : boolean, default False
Check the new index for duplicates. Otherwise defer the check until
necessary. Setting to False will improve the performance of this
method
Examples
--------
indexed_df = df.set_index(['A', 'B'])
indexed_df2 = df.set_index(['A', [0, 1, 2, 0, 1, 2]])
indexed_df3 = df.set_index([[0, 1, 2, 0, 1, 2]])
Returns
-------
dataframe : DataFrame
"""
cols = col_or_cols
if not isinstance(col_or_cols, (list, tuple)):
cols = [col_or_cols]
if not isinstance(keys, (list, tuple)):
keys = [keys]

if inplace:
frame = self
Expand All @@ -2102,13 +2107,16 @@ def set_index(self, col_or_cols, drop=True, inplace=False,
frame = self.copy()

arrays = []
for col in cols:
level = frame[col]
if drop:
del frame[col]
for col in keys:
if isinstance(col, (list, Series, np.ndarray)):
level = col
else:
level = frame[col]
if drop:
del frame[col]
arrays.append(level)

index = MultiIndex.from_arrays(arrays, names=cols)
index = MultiIndex.from_arrays(arrays, names=keys)

if verify_integrity and not index.is_unique:
duplicates = index.get_duplicates()
Expand Down
2 changes: 1 addition & 1 deletion pandas/src/tseries.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ def fast_zip(list ndarrays):
arr = ndarrays[j]
it = <flatiter> PyArray_IterNew(arr)
if len(arr) != n:
raise ValueError('all arrays but be same length')
raise ValueError('all arrays must be same length')

for i in range(n):
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
Expand Down
133 changes: 73 additions & 60 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1235,6 +1235,79 @@ def test_set_index(self):
self.assertRaises(Exception, setattr, self.mixed_frame, 'index',
idx[::2])

def test_set_index2(self):
    # Exercises DataFrame.set_index with a single column label and with a
    # list of column labels, for both drop settings, returning a new
    # object as well as modifying the frame in place.
    data = {'A' : ['foo', 'foo', 'foo', 'bar', 'bar'],
            'B' : ['one', 'two', 'three', 'one', 'two'],
            'C' : ['a', 'b', 'c', 'd', 'e'],
            'D' : np.random.randn(5),
            'E' : np.random.randn(5)}
    frame = DataFrame(data)

    # --- single key, returns a new object ---
    single_dropped = frame.set_index('C')
    single_kept = frame.set_index('C', drop=False)

    c_index = Index(frame['C'], name='C')

    # With drop=True the key column is gone from the result
    want_single_dropped = frame.ix[:, ['A', 'B', 'D', 'E']]
    want_single_dropped.index = c_index

    # With drop=False every column is kept alongside the new index
    want_single_kept = frame.copy()
    want_single_kept.index = c_index

    assert_frame_equal(single_dropped, want_single_dropped)
    assert_frame_equal(single_kept, want_single_kept)
    self.assertEqual(single_dropped.index.name, c_index.name)

    # --- single key, inplace ---
    inplace_dropped = frame.copy()
    inplace_dropped.set_index('C', inplace=True)
    assert_frame_equal(inplace_dropped, want_single_dropped)

    inplace_kept = frame.copy()
    inplace_kept.set_index('C', drop=False, inplace=True)
    assert_frame_equal(inplace_kept, want_single_kept)

    # --- list of keys, returns a new object ---
    multi_dropped = frame.set_index(['A', 'B'])
    multi_kept = frame.set_index(['A', 'B'], drop=False)

    ab_index = MultiIndex.from_arrays([frame['A'], frame['B']],
                                      names=['A', 'B'])

    want_multi_dropped = frame.ix[:, ['C', 'D', 'E']]
    want_multi_dropped.index = ab_index

    want_multi_kept = frame.copy()
    want_multi_kept.index = ab_index

    assert_frame_equal(multi_dropped, want_multi_dropped)
    assert_frame_equal(multi_kept, want_multi_kept)
    self.assertEqual(multi_dropped.index.names, ab_index.names)

    # --- list of keys, inplace ---
    inplace_dropped = frame.copy()
    inplace_dropped.set_index(['A', 'B'], inplace=True)
    assert_frame_equal(inplace_dropped, want_multi_dropped)

    inplace_kept = frame.copy()
    inplace_kept.set_index(['A', 'B'], drop=False, inplace=True)
    assert_frame_equal(inplace_kept, want_multi_kept)

    # Corner case: column 'A' holds repeated values, so requesting the
    # integrity check explicitly is expected to raise.
    self.assertRaises(Exception, frame.set_index, 'A', verify_integrity=True)

def test_set_index_pass_arrays(self):
    # Passing a raw array of column B's values in the key list should give
    # the same frame as naming column 'B' directly.
    data = {'A' : ['foo', 'bar', 'foo', 'bar',
                   'foo', 'bar', 'foo', 'foo'],
            'B' : ['one', 'one', 'two', 'three',
                   'two', 'two', 'one', 'three'],
            'C' : np.random.randn(8),
            'D' : np.random.randn(8)}
    frame = DataFrame(data)

    # multiple keys: one column label plus one external array
    b_values = frame['B'].values
    via_array = frame.set_index(['A', b_values], drop=False)
    via_labels = frame.set_index(['A', 'B'], drop=False)
    assert_frame_equal(via_array, via_labels)

def test_set_columns(self):
cols = Index(np.arange(len(self.mixed_frame.columns)))
self.mixed_frame.columns = cols
Expand Down Expand Up @@ -3502,66 +3575,6 @@ def test_reindex_fill_value(self):
expected = df.reindex(range(15)).fillna(0)
assert_frame_equal(result, expected)

def test_set_index2(self):
    # Exercises DataFrame.set_index with a single column label and with a
    # list of column labels, for both drop settings, returning a new
    # object as well as modifying the frame in place.
    data = {'A' : ['foo', 'foo', 'foo', 'bar', 'bar'],
            'B' : ['one', 'two', 'three', 'one', 'two'],
            'C' : ['a', 'b', 'c', 'd', 'e'],
            'D' : np.random.randn(5),
            'E' : np.random.randn(5)}
    frame = DataFrame(data)

    # --- single key, returns a new object ---
    single_dropped = frame.set_index('C')
    single_kept = frame.set_index('C', drop=False)

    c_index = Index(frame['C'], name='C')

    # With drop=True the key column is gone from the result
    want_single_dropped = frame.ix[:, ['A', 'B', 'D', 'E']]
    want_single_dropped.index = c_index

    # With drop=False every column is kept alongside the new index
    want_single_kept = frame.copy()
    want_single_kept.index = c_index

    assert_frame_equal(single_dropped, want_single_dropped)
    assert_frame_equal(single_kept, want_single_kept)
    self.assertEqual(single_dropped.index.name, c_index.name)

    # --- single key, inplace ---
    inplace_dropped = frame.copy()
    inplace_dropped.set_index('C', inplace=True)
    assert_frame_equal(inplace_dropped, want_single_dropped)

    inplace_kept = frame.copy()
    inplace_kept.set_index('C', drop=False, inplace=True)
    assert_frame_equal(inplace_kept, want_single_kept)

    # --- list of keys, returns a new object ---
    multi_dropped = frame.set_index(['A', 'B'])
    multi_kept = frame.set_index(['A', 'B'], drop=False)

    ab_index = MultiIndex.from_arrays([frame['A'], frame['B']],
                                      names=['A', 'B'])

    want_multi_dropped = frame.ix[:, ['C', 'D', 'E']]
    want_multi_dropped.index = ab_index

    want_multi_kept = frame.copy()
    want_multi_kept.index = ab_index

    assert_frame_equal(multi_dropped, want_multi_dropped)
    assert_frame_equal(multi_kept, want_multi_kept)
    self.assertEqual(multi_dropped.index.names, ab_index.names)

    # --- list of keys, inplace ---
    inplace_dropped = frame.copy()
    inplace_dropped.set_index(['A', 'B'], inplace=True)
    assert_frame_equal(inplace_dropped, want_multi_dropped)

    inplace_kept = frame.copy()
    inplace_kept.set_index(['A', 'B'], drop=False, inplace=True)
    assert_frame_equal(inplace_kept, want_multi_kept)

    # Corner case: column 'A' holds repeated values; set_index called with
    # its default arguments is expected to raise here.
    self.assertRaises(Exception, frame.set_index, 'A')

def test_align(self):
af, bf = self.frame.align(self.frame)
self.assert_(af._data is not self.frame._data)
Expand Down

0 comments on commit 553bacc

Please sign in to comment.