Skip to content

Commit

Permalink
ENH: refactoring to support ordered factors, cut/qcut return factors. #…
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Jun 5, 2012
1 parent a693e7b commit 4655828
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 40 deletions.
9 changes: 6 additions & 3 deletions pandas/core/factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ def from_array(cls, data):

levels = None

def __array__(self):
return self.levels.values.take(self.labels)
def __array__(self, dtype=None):
return com.take_1d(self.levels, self.labels)

def __len__(self):
return len(self.labels)
Expand All @@ -58,7 +58,10 @@ def __repr__(self):
def __getitem__(self, key):
if isinstance(key, (int, np.integer)):
i = self.labels[key]
return self.levels[i]
if i == -1:
return np.nan
else:
return self.levels[i]
else:
return Factor(self.labels[key], self.levels)

Expand Down
56 changes: 28 additions & 28 deletions pandas/tools/tests/test_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,26 @@ def test_simple(self):

def test_bins(self):
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])
result, bins = cut(data, 3, labels=False, retbins=True)
assert_equal(result, [1, 1, 1, 2, 3, 1])
result, bins = cut(data, 3, retbins=True)
assert_equal(result.labels, [0, 0, 0, 1, 2, 0])
assert_almost_equal(bins, [ 0.1905, 3.36666667, 6.53333333, 9.7])

def test_right(self):
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=True, labels=False, retbins=True)
assert_equal(result, [1, 1, 1, 3, 4, 1, 1])
result, bins = cut(data, 4, right=True, retbins=True)
assert_equal(result.labels, [0, 0, 0, 2, 3, 0, 0])
assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7])

def test_noright(self):
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=False, labels=False, retbins=True)
assert_equal(result, [1, 1, 1, 3, 4, 1, 2])
result, bins = cut(data, 4, right=False, retbins=True)
assert_equal(result.labels, [0, 0, 0, 2, 3, 0, 1])
assert_almost_equal(bins, [ 0.2, 2.575, 4.95, 7.325, 9.7095])

def test_arraylike(self):
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
result, bins = cut(data, 3, labels=False, retbins=True)
assert_equal(result, [1, 1, 1, 2, 3, 1])
result, bins = cut(data, 3, retbins=True)
assert_equal(result.labels, [0, 0, 0, 1, 2, 0])
assert_almost_equal(bins, [ 0.1905, 3.36666667, 6.53333333, 9.7])

def test_bins_not_monotonic(self):
Expand All @@ -51,39 +51,39 @@ def test_bins_not_monotonic(self):
def test_labels(self):
arr = np.tile(np.arange(0, 1.01, 0.1), 4)

labels, bins = cut(arr, 4, retbins=True)
distinct_labels = sorted(unique(labels))
ex_labels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]',
result, bins = cut(arr, 4, retbins=True)
ex_levels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]',
'(0.75, 1]']
self.assertEqual(distinct_labels, ex_labels)
self.assert_(np.array_equal(result.levels, ex_levels))

labels, bins = cut(arr, 4, retbins=True, right=False)
distinct_labels = sorted(unique(labels))
ex_labels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)',
result, bins = cut(arr, 4, retbins=True, right=False)
ex_levels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)',
'[0.75, 1.001)']
self.assertEqual(distinct_labels, ex_labels)
self.assert_(np.array_equal(result.levels, ex_levels))

def test_label_precision(self):
arr = np.arange(0, 0.73, 0.01)

labels = cut(arr, 4, precision=2)
distinct_labels = sorted(unique(labels))
ex_labels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]',
result = cut(arr, 4, precision=2)
ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]',
'(0.54, 0.72]']
self.assertEqual(distinct_labels, ex_labels)
self.assert_(np.array_equal(result.levels, ex_levels))

def test_na_handling(self):
arr = np.arange(0, 0.75, 0.01)
arr[::3] = np.nan

labels = cut(arr, 4)
ex_labels = np.where(com.isnull(arr), np.nan, labels)
result = cut(arr, 4)

tm.assert_almost_equal(labels, ex_labels)
result_arr = np.asarray(result)

labels = cut(arr, 4, labels=False)
ex_labels = np.where(com.isnull(arr), np.nan, labels)
tm.assert_almost_equal(labels, ex_labels)
ex_arr = np.where(com.isnull(arr), np.nan, result_arr)

tm.assert_almost_equal(result_arr, ex_arr)

result = cut(arr, 4, labels=False)
ex_result = np.where(com.isnull(arr), np.nan, result)
tm.assert_almost_equal(result, ex_result)

def test_qcut(self):
arr = np.random.randn(1000)
Expand All @@ -94,9 +94,9 @@ def test_qcut(self):

assert_almost_equal(bins, ex_bins)

ex_labels = cut(arr, ex_bins)
ex_levels = cut(arr, ex_bins)

self.assert_(np.array_equal(labels, ex_labels))
self.assert_(np.array_equal(labels, ex_levels))


if __name__ == '__main__':
Expand Down
18 changes: 9 additions & 9 deletions pandas/tools/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,28 +153,28 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,

fmt = lambda v: _format_label(v, precision=precision)
if right:
strings = ['(%s, %s]' % (fmt(x), fmt(y))
levels = ['(%s, %s]' % (fmt(x), fmt(y))
for x, y in zip(labels, labels[1:])]
else:
strings = ['[%s, %s)' % (fmt(x), fmt(y))
levels = ['[%s, %s)' % (fmt(x), fmt(y))
for x, y in zip(labels, labels[1:])]

strings = np.asarray(strings, dtype=object)
levels = np.asarray(levels, dtype=object)

if has_nas:
np.putmask(ids, mask, 0)

labels = com.take_1d(strings, ids - 1)
fac = Factor(ids - 1, levels)
else:
labels = ids
fac = ids
if has_nas:
labels = labels.astype(np.float64)
np.putmask(labels, mask, np.nan)
fac = ids.astype(np.float64)
np.putmask(fac, mask, np.nan)

if not retbins:
return labels
return fac

return labels, bins
return fac, bins


def _format_label(x, precision=3):
Expand Down

0 comments on commit 4655828

Please sign in to comment.