Skip to content

Commit

Permalink
ENH: implement qcut for quantile cuts, fix 32-bit build close #1378
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Jun 5, 2012
1 parent 6e46099 commit 3e904fd
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 38 deletions.
1 change: 1 addition & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ pandas 0.8.0
- Add Panel.transpose method for rearranging axes (#695)
- Add new ``cut`` function (patterned after R) for discretizing data into
equal range-length bins or arbitrary breaks of your choosing (#415)
- Add new ``qcut`` for cutting with quantiles (#1378)
- Added Andrews curves plot type (#1325)
- Add support for tox and Travis CI (#1382)

Expand Down
72 changes: 72 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import pandas.core.common as com
import pandas.lib as lib
import pandas._algos as _algos

def match(to_match, values, na_sentinel=-1):
"""
Expand Down Expand Up @@ -179,6 +180,77 @@ def rank(values, axis=0, method='average', na_option='keep',
ascending=ascending)
return ranks

def quantile(x, q, interpolation_method='fraction'):
    """
    Compute sample quantile or quantiles of the input array. For example, q=0.5
    computes the median.

    The `interpolation_method` parameter supports three values, namely
    `fraction` (default), `lower` and `higher`. Interpolation is done only
    if the desired quantile lies between two data points `i` and `j`. For
    `fraction`, the result is an interpolated value between `i` and `j`;
    for `lower`, the result is `i`, for `higher` the result is `j`.

    Parameters
    ----------
    x : ndarray
        Values from which to extract score. A sorted copy is used; the input
        is not modified.
    q : scalar or array
        Quantile(s) at which to extract score, expressed as a fraction of
        the sorted data (0 maps to the first element, 1 to the last).
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        This optional parameter specifies the interpolation method to use,
        when the desired quantile lies between two data points `i` and `j`:

        - fraction: `i + (j - i)*fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        - lower: `i`.
        - higher: `j`.

    Returns
    -------
    score : scalar or array
        Score at the requested quantile when `q` is a scalar; otherwise the
        result of mapping the scalar computation over `q` with
        ``_algos.arrmap_float64``.

    Raises
    ------
    ValueError
        If `interpolation_method` is not one of the supported values.

    Examples
    --------
    >>> a = np.arange(100)
    >>> quantile(a, 0.5)
    49.5
    """
    # Validate once, up front, so a bad method name is reported even when
    # every requested quantile happens to land exactly on a data point.
    if interpolation_method not in ('fraction', 'lower', 'higher'):
        raise ValueError("interpolation_method can only be 'fraction', "
                         "'lower' or 'higher'")

    values = np.sort(x)

    def _get_score(at):
        # Fractional position of the quantile within the sorted values.
        idx = at * (len(values) - 1)

        # NumPy requires integer indices, so every position is converted
        # explicitly before it is used to index ``values``.
        if idx % 1 == 0:
            return values[int(idx)]

        if interpolation_method == 'fraction':
            return _interpolate(values[int(idx)], values[int(idx) + 1],
                                idx % 1)
        if interpolation_method == 'lower':
            return values[int(np.floor(idx))]
        # Only 'higher' remains after the validation above.
        return values[int(np.ceil(idx))]

    if np.isscalar(q):
        return _get_score(q)
    else:
        q = np.asarray(q, np.float64)
        return _algos.arrmap_float64(q, _get_score)

def _interpolate(a, b, fraction):
"""Returns the point at the given fraction between a and b, where
'fraction' must be between 0 and 1.
"""
return a + (b - a)*fraction


def _get_data_algo(values, func_map):
if com.is_float_dtype(values):
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1083,8 +1083,8 @@ def value_counts(self):
-------
counts : Series
"""
import pandas.core.algorithms as algos
return algos.value_counts(self.values, sort=True, ascending=False)
from pandas.core.algorithms import value_counts
return value_counts(self.values, sort=True, ascending=False)

def unique(self):
"""
Expand Down
18 changes: 14 additions & 4 deletions pandas/src/datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -887,8 +887,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
result_b.fill(NPY_NAT)

# left side
idx_shifted = np.maximum(0, trans.searchsorted(vals - DAY_NS,
side='right') - 1)
idx_shifted = _ensure_int64(
np.maximum(0, trans.searchsorted(vals - DAY_NS, side='right') - 1))

for i in range(n):
v = vals[i] - deltas[idx_shifted[i]]
Expand All @@ -899,8 +899,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
result_a[i] = v

# right side
idx_shifted = np.maximum(0, trans.searchsorted(vals + DAY_NS,
side='right') - 1)
idx_shifted = _ensure_int64(
np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1))

for i in range(n):
v = vals[i] - deltas[idx_shifted[i]]
Expand Down Expand Up @@ -929,6 +929,16 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):

return result

cdef _ensure_int64(object arr):
    # Coerce `arr` to an int64 ndarray.
    # Non-array input is converted with np.array(..., dtype=np.int64).
    if not util.is_array(arr):
        return np.array(arr, dtype=np.int64)

    # An ndarray of another dtype is cast to int64.
    if (<ndarray> arr).descr.type_num != NPY_INT64:
        return arr.astype(np.int64)

    # Already int64: hand back the same object without copying.
    return arr


cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n):
cdef Py_ssize_t pivot, left = 0, right = n

Expand Down
17 changes: 16 additions & 1 deletion pandas/tools/tests/test_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import pandas.util.testing as tm
import pandas.core.common as com

from pandas.tools.tile import cut
from pandas.core.algorithms import quantile
from pandas.tools.tile import cut, qcut

from numpy.testing import assert_equal, assert_almost_equal

Expand Down Expand Up @@ -84,6 +85,20 @@ def test_na_handling(self):
ex_labels = np.where(com.isnull(arr), np.nan, labels)
tm.assert_almost_equal(labels, ex_labels)

def test_qcut(self):
    # qcut into 4 buckets must agree with cut() applied to the explicitly
    # computed quartile edges of the same data.
    data = np.random.randn(1000)

    result_labels, result_bins = qcut(data, 4, retbins=True)

    quartiles = [0, .25, .5, .75, 1.]
    expected_bins = quantile(data, quartiles)
    assert_almost_equal(result_bins, expected_bins)

    expected_labels = cut(data, expected_bins)
    labels_match = np.array_equal(result_labels, expected_labels)
    self.assert_(labels_match)


if __name__ == '__main__':
    # Run this module's tests through nose when executed as a script.
    # NOTE(review): flags presumably mean verbose output without capture
    # (-vvs), stop at first failure (-x) and drop into pdb on errors and
    # failures -- confirm against the nose documentation.
    nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
                   exit=False)
Expand Down
75 changes: 45 additions & 30 deletions pandas/tools/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

from pandas.core.api import DataFrame, Series
import pandas.core.algorithms as algos
import pandas.core.common as com
import pandas.core.nanops as nanops

Expand Down Expand Up @@ -92,13 +93,56 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
if (np.diff(bins) < 0).any():
raise ValueError('bins must increase monotonically.')

return _bins_to_cuts(x, bins, right=right, labels=labels,
retbins=retbins, precision=precision)



def qcut(x, q=4, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    buckets whose edges are sample quantiles of `x`. For example, q=10 uses
    the deciles of `x` as bin edges, and q=4 uses the quartiles.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Labels to use for bin edges, or False to return integer bin labels
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if `q` is given
        as an integer, since the bin edges are then computed from the data.
    precision : int, default 3
        Forwarded unchanged to ``_bins_to_cuts``.

    Returns
    -------
    The result of ``_bins_to_cuts`` for the quantile-derived bins: the
    labels for `x`, together with the bin edges when `retbins` is True.

    Notes
    -----
    Bin edges are computed with ``algos.quantile`` using its default
    interpolation method.

    Examples
    --------
    >>> labels, bins = qcut(np.random.randn(1000), 4, retbins=True)
    >>> labels, bins = qcut(np.random.randn(1000), [0, .25, .5, .75, 1.],
    ...                     retbins=True)
    """
    if com.is_integer(q):
        # Evenly spaced quantiles: q buckets need q + 1 edges from 0 to 1.
        quantiles = np.linspace(0, 1, q + 1)
    else:
        # Caller supplied the quantiles explicitly, as the docstring
        # promises; algos.quantile accepts an array of quantiles directly.
        quantiles = q

    bins = algos.quantile(x, quantiles)
    return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
                         precision=precision)


def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
precision=3):
side = 'left' if right else 'right'
ids = bins.searchsorted(x, side=side)

mask = com.isnull(x)
has_nas = mask.any()


if labels is not False:
if labels is None:
labels = bins
Expand Down Expand Up @@ -132,35 +176,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
return labels, bins


def qcut(x, n, ties_method='average'):
    """
    Quantile-based discretization function. Discretize variable into
    equal-sized buckets based on rank. For example 1000 values for 10 quantiles
    would produce 1000 integers from 0 to 9, one per input value.

    NOTE: this is an unimplemented placeholder; the body does nothing and
    the function returns None.

    Parameters
    ----------
    x : ndarray or Series
    n : integer
        Number of quantiles. 10 for deciles, 4 for quartiles, etc.
    ties_method : {'average', 'min', 'max', 'first'}, default 'average'
        average: average rank of group
        min: lowest rank in group
        max: highest rank in group
        first: ranks assigned in order they appear in the array

    Returns
    -------
    None (not implemented).

    Notes
    -----

    Examples
    --------
    """
    pass


def _format_label(x, precision=3):
fmt_str = '%%.%dg' % precision
if com.is_float(x):
Expand Down
2 changes: 1 addition & 1 deletion scripts/count_code.sh
Original file line number Diff line number Diff line change
@@ -1 +1 @@
cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c|generated.c"
cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c|generated.c|plib.c"

0 comments on commit 3e904fd

Please sign in to comment.